diff --git a/.gitignore b/.gitignore index 28f2aca8..5a2ad423 100644 --- a/.gitignore +++ b/.gitignore @@ -91,3 +91,7 @@ LOCK LOG* CURRENT MANIFEST-* + +#cmakefiles +src/caffe/test/CMakeFiles +src/caffe/CMakeFiles diff --git a/LICENSE b/LICENSE index d69d16f5..ca91d911 100644 --- a/LICENSE +++ b/LICENSE @@ -42,3 +42,9 @@ CONTRIBUTION AGREEMENT By contributing to the BVLC/caffe repository through pull-request, comment, or otherwise, the contributor releases their content to the license and copyright terms herein. + +AMD license on the OpenCL parts + +AMD holds license for the OpenCL related code, kernels and optimizations. +AMD license is added to the file or part of the file that written by AMD. +For details, please see license declaration for individual file. diff --git a/Makefile b/Makefile index 05b783af..905a19c3 100644 --- a/Makefile +++ b/Makefile @@ -38,13 +38,10 @@ DYNAMIC_NAME := $(LIB_BUILD_DIR)/lib$(PROJECT).so ############################## # CXX_SRCS are the source files excluding the test ones. CXX_SRCS := $(shell find src/$(PROJECT) ! -name "test_*.cpp" -name "*.cpp") -# CU_SRCS are the cuda source files -CU_SRCS := $(shell find src/$(PROJECT) ! -name "test_*.cu" -name "*.cu") # TEST_SRCS are the test source files TEST_MAIN_SRC := src/$(PROJECT)/test/test_caffe_main.cpp TEST_SRCS := $(shell find src/$(PROJECT) -name "test_*.cpp") TEST_SRCS := $(filter-out $(TEST_MAIN_SRC), $(TEST_SRCS)) -TEST_CU_SRCS := $(shell find src/$(PROJECT) -name "test_*.cu") GTEST_SRC := src/gtest/gtest-all.cpp # TOOL_SRCS are the source files for the tool binaries TOOL_SRCS := $(shell find tools -name "*.cpp") @@ -68,7 +65,7 @@ NONGEN_CXX_SRCS := $(shell find \ matlab/+$(PROJECT)/private \ examples \ tools \ - -name "*.cpp" -or -name "*.hpp" -or -name "*.cu" -or -name "*.cuh") + -name "*.cpp" -or -name "*.hpp") LINT_SCRIPT := scripts/cpp_lint.py LINT_OUTPUT_DIR := $(BUILD_DIR)/.lint LINT_EXT := lint.txt @@ -103,22 +100,19 @@ PROTO_GEN_PY := $(foreach file,${PROTO_SRCS:.proto=_pb2.py}, \ # These objects will be linked into the final shared library, so we # exclude the tool, example, and test objects. CXX_OBJS := $(addprefix $(BUILD_DIR)/, ${CXX_SRCS:.cpp=.o}) -CU_OBJS := $(addprefix $(BUILD_DIR)/cuda/, ${CU_SRCS:.cu=.o}) PROTO_OBJS := ${PROTO_GEN_CC:.cc=.o} -OBJS := $(PROTO_OBJS) $(CXX_OBJS) $(CU_OBJS) +OBJS := $(PROTO_OBJS) $(CXX_OBJS) # tool, example, and test objects TOOL_OBJS := $(addprefix $(BUILD_DIR)/, ${TOOL_SRCS:.cpp=.o}) TOOL_BUILD_DIR := $(BUILD_DIR)/tools TEST_CXX_BUILD_DIR := $(BUILD_DIR)/src/$(PROJECT)/test -TEST_CU_BUILD_DIR := $(BUILD_DIR)/cuda/src/$(PROJECT)/test TEST_CXX_OBJS := $(addprefix $(BUILD_DIR)/, ${TEST_SRCS:.cpp=.o}) -TEST_CU_OBJS := $(addprefix $(BUILD_DIR)/cuda/, ${TEST_CU_SRCS:.cu=.o}) -TEST_OBJS := $(TEST_CXX_OBJS) $(TEST_CU_OBJS) +TEST_OBJS := $(TEST_CXX_OBJS) GTEST_OBJ := $(addprefix $(BUILD_DIR)/, ${GTEST_SRC:.cpp=.o}) EXAMPLE_OBJS := $(addprefix $(BUILD_DIR)/, ${EXAMPLE_SRCS:.cpp=.o}) # Output files for automatic dependency generation -DEPS := ${CXX_OBJS:.o=.d} ${CU_OBJS:.o=.d} ${TEST_CXX_OBJS:.o=.d} \ - ${TEST_CU_OBJS:.o=.d} $(BUILD_DIR)/${MAT$(PROJECT)_SO:.$(MAT_SO_EXT)=.d} +DEPS := ${CXX_OBJS:.o=.d} ${TEST_CXX_OBJS:.o=.d} \ + $(BUILD_DIR)/${MAT$(PROJECT)_SO:.$(MAT_SO_EXT)=.d} # tool, example, and test bins TOOL_BINS := ${TOOL_OBJS:.o=.bin} EXAMPLE_BINS := ${EXAMPLE_OBJS:.o=.bin} @@ -126,11 +120,9 @@ EXAMPLE_BINS := ${EXAMPLE_OBJS:.o=.bin} TOOL_BIN_LINKS := ${TOOL_BINS:.bin=} # Put the test binaries in build/test for convenience. 
TEST_BIN_DIR := $(BUILD_DIR)/test -TEST_CU_BINS := $(addsuffix .testbin,$(addprefix $(TEST_BIN_DIR)/, \ - $(foreach obj,$(TEST_CU_OBJS),$(basename $(notdir $(obj)))))) TEST_CXX_BINS := $(addsuffix .testbin,$(addprefix $(TEST_BIN_DIR)/, \ $(foreach obj,$(TEST_CXX_OBJS),$(basename $(notdir $(obj)))))) -TEST_BINS := $(TEST_CXX_BINS) $(TEST_CU_BINS) +TEST_BINS := $(TEST_CXX_BINS) # TEST_ALL_BIN is the test binary that links caffe dynamically. TEST_ALL_BIN := $(TEST_BIN_DIR)/test_all.testbin @@ -139,35 +131,45 @@ TEST_ALL_BIN := $(TEST_BIN_DIR)/test_all.testbin ############################## WARNS_EXT := warnings.txt CXX_WARNS := $(addprefix $(BUILD_DIR)/, ${CXX_SRCS:.cpp=.o.$(WARNS_EXT)}) -CU_WARNS := $(addprefix $(BUILD_DIR)/cuda/, ${CU_SRCS:.cu=.o.$(WARNS_EXT)}) TOOL_WARNS := $(addprefix $(BUILD_DIR)/, ${TOOL_SRCS:.cpp=.o.$(WARNS_EXT)}) EXAMPLE_WARNS := $(addprefix $(BUILD_DIR)/, ${EXAMPLE_SRCS:.cpp=.o.$(WARNS_EXT)}) TEST_WARNS := $(addprefix $(BUILD_DIR)/, ${TEST_SRCS:.cpp=.o.$(WARNS_EXT)}) -TEST_CU_WARNS := $(addprefix $(BUILD_DIR)/cuda/, ${TEST_CU_SRCS:.cu=.o.$(WARNS_EXT)}) ALL_CXX_WARNS := $(CXX_WARNS) $(TOOL_WARNS) $(EXAMPLE_WARNS) $(TEST_WARNS) -ALL_CU_WARNS := $(CU_WARNS) $(TEST_CU_WARNS) -ALL_WARNS := $(ALL_CXX_WARNS) $(ALL_CU_WARNS) +ALL_WARNS := $(ALL_CXX_WARNS) EMPTY_WARN_REPORT := $(BUILD_DIR)/.$(WARNS_EXT) NONEMPTY_WARN_REPORT := $(BUILD_DIR)/$(WARNS_EXT) -############################## -# Derive include and lib directories -############################## -CUDA_INCLUDE_DIR := $(CUDA_DIR)/include +################################# +# OpenCL include and library +################################# +OCL_INCLUDE_DIR := $(OCL_DIR)/include +CLBLAS_INCLUDE_DIR := ${CLBLAS_DIR}/include + +OCL_LIB_DIR := +CLBLAS_LIB_DIR := +# add /lib/x86_64 only if it exists +ifneq ("$(wildcard $(OCL_LIB_DIR)/lib/x86_64)","") + OCL_LIB_DIR += $(OCL_DIR)/lib/x86_64 +endif +OCL_LIB_DIR += $(OCL_DIR)/lib/x86 + +# add /lib/ only if it exists +ifneq ("$(wildcard $(CLBLAS_DIR)/lib)","") + CLBLAS_LIB_DIR += $(CLBLAS_LIB_DIR)/lib +endif -CUDA_LIB_DIR := -# add /lib64 only if it exists -ifneq ("$(wildcard $(CUDA_DIR)/lib64)","") - CUDA_LIB_DIR += $(CUDA_DIR)/lib64 +# add /lib64/ only if it exists +ifneq ("$(wildcard $(CLBLAS_DIR)/lib64)","") + CLBLAS_LIB_DIR += $(CLBLAS_LIB_DIR)/lib64 endif -CUDA_LIB_DIR += $(CUDA_DIR)/lib INCLUDE_DIRS += $(BUILD_INCLUDE_DIR) ./src ./include ifneq ($(CPU_ONLY), 1) - INCLUDE_DIRS += $(CUDA_INCLUDE_DIR) - LIBRARY_DIRS += $(CUDA_LIB_DIR) - LIBRARIES := cudart cublas curand + INCLUDE_DIRS += $(OCL_INCLUDE_DIR) + $(CLBLAS_INCLUDE_DIR) + LIBRARY_DIRS += $(OCL_LIB_DIR) + $(CLBLAS_LIB_DIR) + LIBRARIES += OpenCL clBLAS + endif LIBRARIES += glog gflags protobuf leveldb snappy \ lmdb boost_system hdf5_hl hdf5 m \ @@ -187,7 +189,6 @@ ifneq ($(strip $(DISTRIBUTE_DIR)),distribute) endif ALL_BUILD_DIRS := $(sort $(BUILD_DIR) $(addprefix $(BUILD_DIR)/, $(SRC_DIRS)) \ - $(addprefix $(BUILD_DIR)/cuda/, $(SRC_DIRS)) \ $(LIB_BUILD_DIR) $(TEST_BIN_DIR) $(PY_PROTO_BUILD_DIR) $(LINT_OUTPUT_DIR) \ $(DISTRIBUTE_SUBDIRS) $(PROTO_BUILD_INCLUDE_DIR)) @@ -206,7 +207,7 @@ DOXYGEN_SOURCES := $(shell find \ matlab/ \ examples \ tools \ - -name "*.cpp" -or -name "*.hpp" -or -name "*.cu" -or -name "*.cuh" -or \ + -name "*.cpp" -or -name "*.hpp" -or \ -name "*.py" -or -name "*.m") DOXYGEN_SOURCES += $(DOXYGEN_CONFIG_FILE) @@ -242,13 +243,8 @@ endif ifeq ($(OSX), 1) CXX := /usr/bin/clang++ ifneq ($(CPU_ONLY), 1) - CUDA_VERSION := $(shell $(CUDA_DIR)/bin/nvcc -V | grep -o 'release \d' | grep -o '\d') - ifeq ($(shell 
echo | awk '{exit $(CUDA_VERSION) < 7.0;}'), 1) - CXXFLAGS += -stdlib=libstdc++ - LINKFLAGS += -stdlib=libstdc++ - endif - # clang throws this warning for cuda headers - WARNINGS += -Wno-unneeded-internal-declaration + # todo + ############# endif # gtest needs to use its own tuple to not conflict with clang COMMON_FLAGS += -DGTEST_USE_OWN_TR1_TUPLE=1 @@ -284,12 +280,6 @@ else COMMON_FLAGS += -DNDEBUG -O2 endif -# cuDNN acceleration configuration. -ifeq ($(USE_CUDNN), 1) - LIBRARIES += cudnn - COMMON_FLAGS += -DUSE_CUDNN -endif - # CPU-only configuration ifeq ($(CPU_ONLY), 1) OBJS := $(PROTO_OBJS) $(CXX_OBJS) @@ -374,7 +364,7 @@ PYTHON_LDFLAGS := $(LDFLAGS) $(foreach library,$(PYTHON_LIBRARIES),-l$(library)) # # * Recursive with the exception that symbolic links are never followed, per the # default behavior of 'find'. -SUPERCLEAN_EXTS := .so .a .o .bin .testbin .pb.cc .pb.h _pb2.py .cuo +SUPERCLEAN_EXTS := .so .a .o .bin .testbin .pb.cc .pb.h _pb2.py # Set the sub-targets of the 'everything' target. EVERYTHING_TARGETS := all py$(PROJECT) test warn lint @@ -525,26 +515,12 @@ $(PROTO_BUILD_DIR)/%.pb.o: $(PROTO_BUILD_DIR)/%.pb.cc $(PROTO_GEN_HEADER) \ || (cat $@.$(WARNS_EXT); exit 1) @ cat $@.$(WARNS_EXT) -$(BUILD_DIR)/cuda/%.o: %.cu | $(ALL_BUILD_DIRS) - @ echo NVCC $< - $(Q)$(CUDA_DIR)/bin/nvcc $(NVCCFLAGS) $(CUDA_ARCH) -M $< -o ${@:.o=.d} \ - -odir $(@D) - $(Q)$(CUDA_DIR)/bin/nvcc $(NVCCFLAGS) $(CUDA_ARCH) -c $< -o $@ 2> $@.$(WARNS_EXT) \ - || (cat $@.$(WARNS_EXT); exit 1) - @ cat $@.$(WARNS_EXT) - $(TEST_ALL_BIN): $(TEST_MAIN_SRC) $(TEST_OBJS) $(GTEST_OBJ) \ | $(DYNAMIC_NAME) $(TEST_BIN_DIR) @ echo CXX/LD -o $@ $< $(Q)$(CXX) $(TEST_MAIN_SRC) $(TEST_OBJS) $(GTEST_OBJ) \ -o $@ $(LINKFLAGS) $(LDFLAGS) -l$(PROJECT) -Wl,-rpath,$(ORIGIN)/../lib -$(TEST_CU_BINS): $(TEST_BIN_DIR)/%.testbin: $(TEST_CU_BUILD_DIR)/%.o \ - $(GTEST_OBJ) | $(DYNAMIC_NAME) $(TEST_BIN_DIR) - @ echo LD $< - $(Q)$(CXX) $(TEST_MAIN_SRC) $< $(GTEST_OBJ) \ - -o $@ $(LINKFLAGS) $(LDFLAGS) -l$(PROJECT) -Wl,-rpath,$(ORIGIN)/../lib - $(TEST_CXX_BINS): $(TEST_BIN_DIR)/%.testbin: $(TEST_CXX_BUILD_DIR)/%.o \ $(GTEST_OBJ) | $(DYNAMIC_NAME) $(TEST_BIN_DIR) @ echo LD $< diff --git a/Makefile.config b/Makefile.config new file mode 100644 index 00000000..eea4c1f3 --- /dev/null +++ b/Makefile.config @@ -0,0 +1,100 @@ +## Refer to http://caffe.berkeleyvision.org/installation.html +# Contributions simplifying and improving our build system are welcome! + +# Use OpenCL + USE_OPENCL := 1 +# OpenCL directory + OCL_DIR := /opt/AMDAPPSDK-2.9-1 +# clBLAS directory + CLBLAS_DIR := /opt/clBLAS-2.1 + +# cuDNN acceleration switch (uncomment to build with cuDNN). +# USE_CUDNN := 1 + +# CPU-only switch (uncomment to build without GPU support). +# CPU_ONLY := 1 + +# To customize your choice of compiler, uncomment and set the following. +# N.B. the default for Linux is g++ and the default for OSX is clang++ +# CUSTOM_CXX := g++ + +# CUDA directory contains bin/ and lib/ directories that we need. +#CUDA_DIR := /usr/local/cuda +# On Ubuntu 14.04, if cuda tools are installed via +# "sudo apt-get install nvidia-cuda-toolkit" then use this instead: +# CUDA_DIR := /usr + +# CUDA architecture setting: going with all of them. +# For CUDA < 6.0, comment the *_50 lines for compatibility. 
+#CUDA_ARCH := -gencode arch=compute_20,code=sm_20 \ + -gencode arch=compute_20,code=sm_21 \ + -gencode arch=compute_30,code=sm_30 \ + -gencode arch=compute_35,code=sm_35 \ + -gencode arch=compute_50,code=sm_50 \ + -gencode arch=compute_50,code=compute_50 + +# BLAS choice: +# atlas for ATLAS (default) +# mkl for MKL +# open for OpenBlas +BLAS := atlas +# Custom (MKL/ATLAS/OpenBLAS) include and lib directories. +# Leave commented to accept the defaults for your choice of BLAS +# (which should work)! +# BLAS_INCLUDE := /path/to/your/blas +# BLAS_LIB := /path/to/your/blas + +# Homebrew puts openblas in a directory that is not on the standard search path +# BLAS_INCLUDE := $(shell brew --prefix openblas)/include +# BLAS_LIB := $(shell brew --prefix openblas)/lib + +# This is required only if you will compile the matlab interface. +# MATLAB directory should contain the mex binary in /bin. +# MATLAB_DIR := /usr/local +# MATLAB_DIR := /Applications/MATLAB_R2012b.app + +# NOTE: this is required only if you will compile the python interface. +# We need to be able to find Python.h and numpy/arrayobject.h. +PYTHON_INCLUDE := /usr/include/python2.7 \ + /usr/lib/python2.7/dist-packages/numpy/core/include +# Anaconda Python distribution is quite popular. Include path: +# Verify anaconda location, sometimes it's in root. +# ANACONDA_HOME := $(HOME)/anaconda +# PYTHON_INCLUDE := $(ANACONDA_HOME)/include \ + # $(ANACONDA_HOME)/include/python2.7 \ + # $(ANACONDA_HOME)/lib/python2.7/site-packages/numpy/core/include \ + +# We need to be able to find libpythonX.X.so or .dylib. +PYTHON_LIB := /usr/lib +# PYTHON_LIB := $(ANACONDA_HOME)/lib + +# Homebrew installs numpy in a non standard path (keg only) +# PYTHON_INCLUDE += $(dir $(shell python -c 'import numpy.core; print(numpy.core.__file__)'))/include +# PYTHON_LIB += $(shell brew --prefix numpy)/lib + +# Uncomment to support layers written in Python (will link against Python libs) +# WITH_PYTHON_LAYER := 1 + +# Whatever else you find you need goes here. +INCLUDE_DIRS := $(PYTHON_INCLUDE) /usr/local/include +LIBRARY_DIRS := $(PYTHON_LIB) /usr/local/lib /usr/lib + +# If Homebrew is installed at a non standard location (for example your home directory) and you use it for general dependencies +# INCLUDE_DIRS += $(shell brew --prefix)/include +# LIBRARY_DIRS += $(shell brew --prefix)/lib + +# Uncomment to use `pkg-config` to specify OpenCV library paths. +# (Usually not necessary -- OpenCV libraries are normally installed in one of the above $LIBRARY_DIRS.) +# USE_PKG_CONFIG := 1 + +BUILD_DIR := build +DISTRIBUTE_DIR := distribute + +# Uncomment for debugging. Does not work on OSX due to https://github.com/BVLC/caffe/issues/171 + DEBUG := 1 + +# The ID of the GPU that 'make runtest' will use to run unit tests. +TEST_GPUID := 0 + +# enable pretty build (comment to see full commands) +Q ?= @ diff --git a/README.md b/README.md index ebec286d..ebc83a1a 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,57 @@ -# Caffe +#OpenCL Caffe + +This is an OpenCL implementation of Caffe, a mainstream DNN framework (https://github.com/BVLC/caffe). It includes a largely complete Caffe feature set as of August 2015. The project is under active development to improve performance and add new features. Contributions from the community are welcome. + +OpenCL (https://en.wikipedia.org/wiki/OpenCL) is an open standard parallel programming language for heterogeneous platforms. OpenCL is supported by a variety of commercial chip manufacturers. 
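
A quick way to confirm that an OpenCL runtime and at least one device are visible before building is to enumerate platforms with the standard OpenCL C API. The sketch below is illustrative only and is not part of this patch; the include and library paths in the build command are assumptions based on the OCL_DIR value shown in Makefile.config above (e.g. /opt/AMDAPPSDK-2.9-1).

```cpp
// cl_query.cpp - minimal sketch: list OpenCL platforms and their device counts.
#include <CL/cl.h>
#include <cstdio>
#include <vector>

int main() {
  // Ask how many platforms the installed OpenCL runtime exposes.
  cl_uint num_platforms = 0;
  if (clGetPlatformIDs(0, NULL, &num_platforms) != CL_SUCCESS || num_platforms == 0) {
    std::fprintf(stderr, "No OpenCL platforms found.\n");
    return 1;
  }
  std::vector<cl_platform_id> platforms(num_platforms);
  clGetPlatformIDs(num_platforms, platforms.data(), NULL);

  for (cl_uint i = 0; i < num_platforms; ++i) {
    char name[256] = {0};
    clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(name), name, NULL);
    // Count all devices (CPU, GPU, APU/accelerator) on this platform.
    cl_uint num_devices = 0;
    clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
    std::printf("Platform %u: %s (%u devices)\n", i, name, num_devices);
  }
  return 0;
}
```

Build and run with something like `g++ cl_query.cpp -I$OCL_DIR/include -L$OCL_DIR/lib/x86_64 -lOpenCL && ./a.out` (the `cl_query.cpp` name and the library subdirectory are assumptions; adjust them to your SDK layout). If this reports no platforms or devices, the OpenCL Caffe build will not be able to use the GPU either.
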
+ +#Design features + -All Caffe layers ported to OpenCL + + -Performance improvement by batched implementation for conv layer based on clBLAS + + -The user can choose the optimal batch number depending on H/W properties, image size and minibatch size + + -Supports OpenCL 2.0, 1.2 + + -Implemented in C++ and OpenCL, maintaining the same interfaces as the original Caffe + + -Users can directly run DNN models: AlexNet, VGG-16 and VGG-19 + +Note: More features are planned in the near future. Currently this implementation has been verified and tuned on AMD devices (CPUs/GPUs/APUs). Compatibility across different chip manufacturers will be considered for future addition. + +#Performance + +We intend to keep updating the latest performance as we make optimizations. Fury results are preliminary and are actively being improved. + +* Training speed (Model: AlexNet, minibatch size 128) + + -AMD W9100, 255 images per second + + -AMD R9 Fury, 261 images per second + +* Recognition speed (Model: AlexNet, minibatch size 128) + + -AMD W9100, 590 images per second + + -AMD R9 Fury, 699 images per second + +#Wiki +For more information on how to install, use or contribute to this code base, please visit our wiki page: + https://github.com/amd/OpenCL-caffe/wiki + +#Contributors +Junli Gu, Yibing Liu, Yuan Gao, Maohua Zhu + +We thank Mauricio Breternitz, Hanjin Chu and Greg Stoner for their technical suggestions and support. + +#Support needed + As an open source project, we hope to maintain an open dynamics and sharing culture. We encourage the contribution and support from the community to improve it together. + +#License +The original Caffe is provided in the [BSD 2-Clause license](https://github.com/BVLC/caffe/blob/master/LICENSE) open source license. The OpenCL ports written by AMD is covered by AMD license. We encourage the contribution and support from external, your contribution will be covered either by BSD 2-Clause license or whichever your preferred license. + +# Original Caffe information +## Caffe Caffe is a deep learning framework made with expression, speed, and modularity in mind. It is developed by the Berkeley Vision and Learning Center ([BVLC](http://bvlc.eecs.berkeley.edu)) and community contributors. diff --git a/cmake/CaffeConfig.cmake b/cmake/CaffeConfig.cmake new file mode 100644 index 00000000..076edc5d --- /dev/null +++ b/cmake/CaffeConfig.cmake @@ -0,0 +1,61 @@ +# Config file for the Caffe package. +# +# Note: +# Caffe and this config file depends on opencv, +# so put `find_package(OpenCV)` before searching Caffe +# via `find_package(Caffe)`. 
All other lib/includes +# dependencies are hard coded in the file +# +# After successful configuration the following variables +# will be defined: +# +# Caffe_INCLUDE_DIRS - Caffe include directories +# Caffe_LIBRARIES - libraries to link against +# Caffe_DEFINITIONS - a list of definitions to pass to compiler +# +# Caffe_HAVE_CUDA - signals about CUDA support +# Caffe_HAVE_CUDNN - signals about cuDNN support + + +# OpenCV dependency + +if(NOT OpenCV_FOUND) + set(Caffe_OpenCV_CONFIG_PATH "/usr/local/share/OpenCV") + if(Caffe_OpenCV_CONFIG_PATH) + get_filename_component(Caffe_OpenCV_CONFIG_PATH ${Caffe_OpenCV_CONFIG_PATH} ABSOLUTE) + + if(EXISTS ${Caffe_OpenCV_CONFIG_PATH} AND NOT TARGET opencv_core) + message(STATUS "Caffe: using OpenCV config from ${Caffe_OpenCV_CONFIG_PATH}") + include(${Caffe_OpenCV_CONFIG_PATH}/OpenCVModules.cmake) + endif() + + else() + find_package(OpenCV REQUIRED) + endif() + unset(Caffe_OpenCV_CONFIG_PATH) +endif() + +# Compute paths +get_filename_component(Caffe_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) +set(Caffe_INCLUDE_DIRS "/usr/local/include;/usr/include;/opt/AMDAPPSDK-2.9-1/include;/opt/clBLAS-2.1/include;/usr/local/include/opencv;/usr/include/atlas") + +get_filename_component(__caffe_include "${Caffe_CMAKE_DIR}/../../include" ABSOLUTE) +list(APPEND Caffe_INCLUDE_DIRS ${__caffe_include}) +unset(__caffe_include) + + +# Our library dependencies +if(NOT TARGET caffe AND NOT caffe_BINARY_DIR) + include("${Caffe_CMAKE_DIR}/CaffeTargets.cmake") +endif() + +# List of IMPORTED libs created by CaffeTargets.cmake +set(Caffe_LIBRARIES caffe) + +# Definitions +set(Caffe_DEFINITIONS "-DCPU_ONLY") + +# Cuda support variables +set(Caffe_CPU_ONLY OFF) +set(Caffe_HAVE_CUDA FALSE) +set(Caffe_HAVE_CUDNN FALSE) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 7c86dd55..eb72e89f 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -44,17 +44,27 @@ include_directories(SYSTEM ${Snappy_INCLUDE_DIR}) list(APPEND Caffe_LINKER_LIBS ${Snappy_LIBRARIES}) # ---[ CUDA -include(cmake/Cuda.cmake) -if(NOT HAVE_CUDA) - if(CPU_ONLY) - message("-- CUDA is disabled. Building without it...") - else() - message("-- CUDA is not detected by cmake. Building without it...") - endif() +#include(cmake/Cuda.cmake) +#if(NOT HAVE_CUDA) +# if(CPU_ONLY) +# message("-- CUDA is disabled. Building without it...") +# else() +# message("-- CUDA is not detected by cmake. Building without it...") +# endif() # TODO: remove this not cross platform define in future. Use caffe_config.h instead. - add_definitions(-DCPU_ONLY) -endif() +# add_definitions(-DCPU_ONLY) +#endif() + +# ---[ OpenCL +find_package(OpenCL REQUIRED) +include_directories(SYSTEM ${OPENCL_INCLUDE_DIRS}) +list(APPEND Caffe_LINKER_LIBS ${OPENCL_LIBRARIES}) + +# ---[ clBLAS +find_package(clBLAS REQUIRED) +include_directories(SYSTEM ${CLBLAS_INCLUDE_DIRS}) +list(APPEND Caffe_LINKER_LIBS ${CLBLAS_LIBRARIES}) # ---[ OpenCV find_package(OpenCV QUIET COMPONENTS core highgui imgproc imgcodecs) diff --git a/cmake/Modules/FindOpenCL.cmake b/cmake/Modules/FindOpenCL.cmake new file mode 100644 index 00000000..93abd4f9 --- /dev/null +++ b/cmake/Modules/FindOpenCL.cmake @@ -0,0 +1,108 @@ +# ######################################################################## +# Copyright 2013 Advanced Micro Devices, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ######################################################################## + + +# Locate an OpenCL implementation. +# Currently supports AMD APP SDK (http://developer.amd.com/sdks/AMDAPPSDK/Pages/default.aspx/) +# +# Defines the following variables: +# +# OPENCL_FOUND - Found the OPENCL framework +# OPENCL_INCLUDE_DIRS - Include directories +# +# Also defines the library variables below as normal +# variables. These contain debug/optimized keywords when +# a debugging library is found. +# +# OPENCL_LIBRARIES - libopencl +# +# Accepts the following variables as input: +# +# OPENCL_ROOT - (as a CMake or environment variable) +# The root directory of the OpenCL implementation found +# +# FIND_LIBRARY_USE_LIB64_PATHS - Global property that controls whether findOpenCL should search for +# 64bit or 32bit libs +#----------------------- +# Example Usage: +# +# find_package(OPENCL REQUIRED) +# include_directories(${OPENCL_INCLUDE_DIRS}) +# +# add_executable(foo foo.cc) +# target_link_libraries(foo ${OPENCL_LIBRARIES}) +# +#----------------------- + +set_property(GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS ON) + +find_path(OPENCL_INCLUDE_DIRS + NAMES OpenCL/cl.h CL/cl.h + HINTS + ${OPENCL_ROOT}/include + $ENV{AMDAPPSDKROOT}/include + $ENV{CUDA_PATH}/include + PATHS + /usr/include + /usr/local/include + /usr/local/cuda/include + /opt/cuda/include + DOC "OpenCL header file path" +) +mark_as_advanced( OPENCL_INCLUDE_DIRS ) + +# Search for 64bit libs if FIND_LIBRARY_USE_LIB64_PATHS is set to true in the global environment, 32bit libs else +get_property( LIB64 GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS ) + +if( LIB64 ) + find_library( OPENCL_LIBRARIES + NAMES OpenCL + HINTS + ${OPENCL_ROOT}/lib + $ENV{AMDAPPSDKROOT}/lib + $ENV{CUDA_PATH}/lib + DOC "OpenCL dynamic library path" + PATH_SUFFIXES x86_64 x64 x86_64/sdk + PATHS + /usr/lib + /usr/local/cuda/lib + /opt/cuda/lib + ) +else( ) + find_library( OPENCL_LIBRARIES + NAMES OpenCL + HINTS + ${OPENCL_ROOT}/lib + $ENV{AMDAPPSDKROOT}/lib + $ENV{CUDA_PATH}/lib + DOC "OpenCL dynamic library path" + PATH_SUFFIXES x86 Win32 + PATHS + /usr/lib + /usr/local/cuda/lib + /opt/cuda/lib + ) +endif( ) +mark_as_advanced( OPENCL_LIBRARIES ) + +include( FindPackageHandleStandardArgs ) +FIND_PACKAGE_HANDLE_STANDARD_ARGS( OPENCL DEFAULT_MSG OPENCL_LIBRARIES OPENCL_INCLUDE_DIRS ) + +if( NOT OPENCL_FOUND ) + message( STATUS "FindOpenCL looked for libraries named: OpenCL" ) +else () + message( STATUS "Found OpenCL (include: ${OPENCL_INCLUDE_DIRS}, library: ${OPENCL_LIBRARIES})") +endif() diff --git a/cmake/Modules/FindclBLAS.cmake b/cmake/Modules/FindclBLAS.cmake new file mode 100644 index 00000000..1fa28762 --- /dev/null +++ b/cmake/Modules/FindclBLAS.cmake @@ -0,0 +1,98 @@ +# ######################################################################## +# Copyright 2013 Advanced Micro Devices, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ######################################################################## + + +# Locate an clBLAS library. +# +# Defines the following variables: +# +# CLBLAS_FOUND - Found the CLBLAS library +# CLBLAS_INCLUDE_DIRS - Include directories +# +# Also defines the library variables below as normal +# variables. These contain debug/optimized keywords when +# a debugging library is found. +# +# CLBLAS_LIBRARIES - libclBLAS +# +# Accepts the following variables as input: +# +# CLBLAS_ROOT - (as a CMake or environment variable) +# The root directory of the clBLAS library found +# +# FIND_LIBRARY_USE_LIB64_PATHS - Global property that controls whether findclBLAS should search for +# 64bit or 32bit libs +#----------------------- +# Example Usage: +# +# find_package(clBLAS REQUIRED) +# include_directories(${CLBLAS_INCLUDE_DIRS}) +# +# add_executable(foo foo.cc) +# target_link_libraries(foo ${CLBLAS_LIBRARIES}) +# +#----------------------- + +set_property(GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS ON) + +find_path(CLBLAS_INCLUDE_DIRS NAMES clBLAS.h + HINTS + $ENV{CLBLAS_ROOT}/include + PATHS + /usr/include + /usr/local/include + DOC "clBLAS header file path" +) +mark_as_advanced( CLBLAS_INCLUDE_DIRS ) + +# Search for 64bit libs if FIND_LIBRARY_USE_LIB64_PATHS is set to true in the global environment, 32bit libs else +get_property( LIB64 GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS ) + +if( LIB64 ) + find_library( CLBLAS_LIBRARIES + NAMES clBLAS + HINTS + $ENV{CLBLAS_ROOT}/lib64 + DOC "clBLAS dynamic library path" + PATHS + /usr/lib + /usr/local/lib + ) +else( ) + find_library( CLBLAS_LIBRARIES + NAMES clBLAS + HINTS + $ENV{CLBLAS_ROOT}/lib + DOC "clBLAS dynamic library path" + PATHS + /usr/lib + /usr/local/lib + ) +endif( ) +mark_as_advanced( CLBLAS_LIBRARIES ) + +if (NOT CLBLAS_INCLUDE_DIRS) + set(CLBLAS_FOUND ON) +endif() + +include( FindPackageHandleStandardArgs ) +FIND_PACKAGE_HANDLE_STANDARD_ARGS( CLBLAS DEFAULT_MSG CLBLAS_LIBRARIES CLBLAS_INCLUDE_DIRS ) + +if( NOT CLBLAS_FOUND ) + message( STATUS "FindclBLAS looked for libraries named: clBLAS" ) +else () + message( STATUS "Found clBLAS (include: ${CLBLAS_INCLUDE_DIRS}, library: ${CLBLAS_LIBRARIES})") +endif() diff --git a/cmake/OpenCL.cmake b/cmake/OpenCL.cmake new file mode 100644 index 00000000..c83ce7eb --- /dev/null +++ b/cmake/OpenCL.cmake @@ -0,0 +1,26 @@ +if(CPU_ONLY) + return() +endif() + +#find_path(OCL_INCLUDE_DIR NAMES CL/cl.h PATHS "$ENV{AMDAPPSDKROOT}/include") +#find_library(OCL_LIBRARIES NAMES libOpenCL.so PATHS "$ENV{AMDAPPSDKROOT}/lib/x86_64") + +#find_path(CLBLAS_INCLUDE_DIR NAMES clBLAS.h PATHS /opt/clBLAS-2.1/include $ENV{C_INCLUDE_PATH} $ENV{CPLUS_INCLUDE_PATH}) +#find_library(CLBLAS_LIBRARIES NAMES libclBLAS.so PATHS $ENV{LD_LIBRARY_PATH}) + +#if(OCL_INCLUDE_DIR AND OCL_LIBRARIES) +# set(OCL_FOUND TRUE PARENT_SCOPE) +# message(STATUS "Found OpenCL (include: ${OCL_INCLUDE_DIR}, library: ${OCL_LIBRARIES})") +#endif() + +#if(CLBLAS_INCLUDE_DIR AND CLBLAS_LIBRARIES) +# set(CLBLAS_FOUND TRUE PARENT_SCOPE) +#endif() + +#set(OCL_INCLUDE_DIR /opt/AMDAPPSDK-2.9-1/include) +#set(OCL_LIBRARIES 
/opt/AMDAPPSDK-2.9-1/lib/x86_64/libOpenCL.so) +set(CLBLAS_INCLUDE_DIR /opt/clBLAS-2.1/include) +set(CLBLAS_LIBRARIES /opt/clBLAS-2.1/lib64/libclBLAS.so) + + + diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index e094ac00..2d95b0a9 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -125,7 +125,9 @@ function(caffe_print_configuration_summary) caffe_status(" Snappy : " SNAPPY_FOUND THEN "Yes (ver. ${Snappy_VERSION})" ELSE "No" ) caffe_status(" LevelDB : " LEVELDB_FOUND THEN "Yes (ver. ${LEVELDB_VERSION})" ELSE "No") caffe_status(" OpenCV : Yes (ver. ${OpenCV_VERSION})") - caffe_status(" CUDA : " HAVE_CUDA THEN "Yes (ver. ${CUDA_VERSION})" ELSE "No" ) +# caffe_status(" CUDA : " HAVE_CUDA THEN "Yes (ver. ${CUDA_VERSION})" ELSE "No" ) + caffe_status(" OpenCL : " OPENCL_FOUND THEN "Yes" ELSE "No") + caffe_status(" clBLAS : " CLBLAS_FOUND THEN "Yes" ELSE "No") caffe_status("") if(HAVE_CUDA) caffe_status("NVIDIA CUDA:") diff --git a/cmake/Templates/caffe_config.h.in b/cmake/Templates/caffe_config.h.in index 6039e8f6..ca9a3a9a 100644 --- a/cmake/Templates/caffe_config.h.in +++ b/cmake/Templates/caffe_config.h.in @@ -14,6 +14,10 @@ /* NVIDA cuDNN */ #cmakedefine CPU_ONLY +/* OpenCL & clBLAS*/ +#cmakedefine OCL_FOUND +#cmakedefine CLBLAS_FOUND + /* Test device */ #define CUDA_TEST_DEVICE ${CUDA_TEST_DEVICE} diff --git a/examples/imagenet/train_alexnet.sh b/examples/imagenet/train_alexnet.sh new file mode 100755 index 00000000..58e5229f --- /dev/null +++ b/examples/imagenet/train_alexnet.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env sh + +GLOG_logtostderr=0 ./build/tools/caffe train \ + --solver=models/bvlc_alexnet/solver.prototxt diff --git a/examples/imagenet/train_alexnet_cpu.sh b/examples/imagenet/train_alexnet_cpu.sh new file mode 100755 index 00000000..a86f75fe --- /dev/null +++ b/examples/imagenet/train_alexnet_cpu.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env sh + +./build/tools/caffe train \ + --solver=models/bvlc_alexnet/solver_cpu.prototxt diff --git a/examples/imagenet/train_alexnet_without_dropout.sh b/examples/imagenet/train_alexnet_without_dropout.sh new file mode 100755 index 00000000..667543bf --- /dev/null +++ b/examples/imagenet/train_alexnet_without_dropout.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env sh + +GLOG_logtostderr=0 ./build/tools/caffe train \ + --solver=models/bvlc_alexnet/solver_without_dropout.prototxt diff --git a/examples/imagenet/train_alexnet_without_dropout_cpu.sh b/examples/imagenet/train_alexnet_without_dropout_cpu.sh new file mode 100755 index 00000000..12d43fc3 --- /dev/null +++ b/examples/imagenet/train_alexnet_without_dropout_cpu.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env sh + +GLOG_logtostderr=0 ./build/tools/caffe train \ + --solver=models/bvlc_alexnet/solver_without_dropout_cpu.prototxt diff --git a/examples/imagenet/train_caffenet_cpu.sh b/examples/imagenet/train_caffenet_cpu.sh new file mode 100755 index 00000000..4bcebf36 --- /dev/null +++ b/examples/imagenet/train_caffenet_cpu.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env sh + +./build/tools/caffe train \ + --solver=models/bvlc_reference_caffenet/solver_cpu.prototxt diff --git a/include/caffe/blob.hpp b/include/caffe/blob.hpp index 472cc184..9f22a082 100644 --- a/include/caffe/blob.hpp +++ b/include/caffe/blob.hpp @@ -23,258 +23,279 @@ namespace caffe { */ template class Blob { - public: - Blob() - : data_(), diff_(), count_(0), capacity_(0) {} + public: + Blob() + : data_(), diff_(), count_(0), capacity_(0) { + } - /// @brief Deprecated; use Blob(const vector& shape). 
- explicit Blob(const int num, const int channels, const int height, - const int width); - explicit Blob(const vector& shape); + /// @brief Deprecated; use Blob(const vector& shape). + explicit Blob(const int num, const int channels, const int height, + const int width); + explicit Blob(const vector& shape); - /// @brief Deprecated; use Reshape(const vector& shape). - void Reshape(const int num, const int channels, const int height, - const int width); - /** - * @brief Change the dimensions of the blob, allocating new memory if - * necessary. - * - * This function can be called both to create an initial allocation - * of memory, and to adjust the dimensions of a top blob during Layer::Reshape - * or Layer::Forward. When changing the size of blob, memory will only be - * reallocated if sufficient memory does not already exist, and excess memory - * will never be freed. - * - * Note that reshaping an input blob and immediately calling Net::Backward is - * an error; either Net::Forward or Net::Reshape need to be called to - * propagate the new input shape to higher layers. - */ - void Reshape(const vector& shape); - void Reshape(const BlobShape& shape); - void ReshapeLike(const Blob& other); - inline string shape_string() const { - ostringstream stream; - for (int i = 0; i < shape_.size(); ++i) { - stream << shape_[i] << " "; + /// @brief Deprecated; use Reshape(const vector& shape). + void Reshape(const int num, const int channels, const int height, + const int width); + /** + * @brief Change the dimensions of the blob, allocating new memory if + * necessary. + * + * This function can be called both to create an initial allocation + * of memory, and to adjust the dimensions of a top blob during Layer::Reshape + * or Layer::Forward. When changing the size of blob, memory will only be + * reallocated if sufficient memory does not already exist, and excess memory + * will never be freed. + * + * Note that reshaping an input blob and immediately calling Net::Backward is + * an error; either Net::Forward or Net::Reshape need to be called to + * propagate the new input shape to higher layers. + */ + void Reshape(const vector& shape); + void Reshape(const BlobShape& shape); + void ReshapeLike(const Blob& other); + inline string shape_string() const { + ostringstream stream; + for (int i = 0; i < shape_.size(); ++i) { + stream << shape_[i] << " "; + } + stream << "(" << count_ << ")"; + return stream.str(); + } + inline const vector& shape() const { + return shape_; + } + /** + * @brief Returns the dimension of the index-th axis (or the negative index-th + * axis from the end, if index is negative). + * + * @param index the axis index, which may be negative as it will be + * "canonicalized" using CanonicalAxisIndex. + * Dies on out of range index. + */ + inline int shape(int index) const { + return shape_[CanonicalAxisIndex(index)]; + } + inline int num_axes() const { + return shape_.size(); + } + inline int count() const { + return count_; } - stream << "(" << count_ << ")"; - return stream.str(); - } - inline const vector& shape() const { return shape_; } - /** - * @brief Returns the dimension of the index-th axis (or the negative index-th - * axis from the end, if index is negative). - * - * @param index the axis index, which may be negative as it will be - * "canonicalized" using CanonicalAxisIndex. - * Dies on out of range index. 
- */ - inline int shape(int index) const { - return shape_[CanonicalAxisIndex(index)]; - } - inline int num_axes() const { return shape_.size(); } - inline int count() const { return count_; } - /** - * @brief Compute the volume of a slice; i.e., the product of dimensions - * among a range of axes. - * - * @param start_axis The first axis to include in the slice. - * - * @param end_axis The first axis to exclude from the slice. - */ - inline int count(int start_axis, int end_axis) const { - CHECK_LE(start_axis, end_axis); - CHECK_GE(start_axis, 0); - CHECK_GE(end_axis, 0); - CHECK_LE(start_axis, num_axes()); - CHECK_LE(end_axis, num_axes()); - int count = 1; - for (int i = start_axis; i < end_axis; ++i) { - count *= shape(i); + /** + * @brief Compute the volume of a slice; i.e., the product of dimensions + * among a range of axes. + * + * @param start_axis The first axis to include in the slice. + * + * @param end_axis The first axis to exclude from the slice. + */ + inline int count(int start_axis, int end_axis) const { + CHECK_LE(start_axis, end_axis); + CHECK_GE(start_axis, 0); + CHECK_GE(end_axis, 0); + CHECK_LE(start_axis, num_axes()); + CHECK_LE(end_axis, num_axes()); + int count = 1; + for (int i = start_axis; i < end_axis; ++i) { + count *= shape(i); + } + return count; + } + /** + * @brief Compute the volume of a slice spanning from a particular first + * axis to the final axis. + * + * @param start_axis The first axis to include in the slice. + */ + inline int count(int start_axis) const { + return count(start_axis, num_axes()); } - return count; - } - /** - * @brief Compute the volume of a slice spanning from a particular first - * axis to the final axis. - * - * @param start_axis The first axis to include in the slice. - */ - inline int count(int start_axis) const { - return count(start_axis, num_axes()); - } - /** - * @brief Returns the 'canonical' version of a (usually) user-specified axis, - * allowing for negative indexing (e.g., -1 for the last axis). - * - * @param index the axis index. - * If 0 <= index < num_axes(), return index. - * If -num_axes <= index <= -1, return (num_axes() - (-index)), - * e.g., the last axis index (num_axes() - 1) if index == -1, - * the second to last if index == -2, etc. - * Dies on out of range index. - */ - inline int CanonicalAxisIndex(int axis_index) const { - CHECK_GE(axis_index, -num_axes()) - << "axis " << axis_index << " out of range for " << num_axes() - << "-D Blob with shape " << shape_string(); - CHECK_LT(axis_index, num_axes()) - << "axis " << axis_index << " out of range for " << num_axes() - << "-D Blob with shape " << shape_string(); - if (axis_index < 0) { - return axis_index + num_axes(); + /** + * @brief Returns the 'canonical' version of a (usually) user-specified axis, + * allowing for negative indexing (e.g., -1 for the last axis). + * + * @param index the axis index. + * If 0 <= index < num_axes(), return index. + * If -num_axes <= index <= -1, return (num_axes() - (-index)), + * e.g., the last axis index (num_axes() - 1) if index == -1, + * the second to last if index == -2, etc. + * Dies on out of range index. 
+ */ + inline int CanonicalAxisIndex(int axis_index) const { + CHECK_GE(axis_index, -num_axes()) << "axis " << axis_index + << " out of range for " << num_axes() << "-D Blob with shape " + << shape_string(); + CHECK_LT(axis_index, num_axes()) << "axis " << axis_index + << " out of range for " << num_axes() << "-D Blob with shape " + << shape_string(); + if (axis_index < 0) { + return axis_index + num_axes(); + } + return axis_index; } - return axis_index; - } - /// @brief Deprecated legacy shape accessor num: use shape(0) instead. - inline int num() const { return LegacyShape(0); } - /// @brief Deprecated legacy shape accessor channels: use shape(1) instead. - inline int channels() const { return LegacyShape(1); } - /// @brief Deprecated legacy shape accessor height: use shape(2) instead. - inline int height() const { return LegacyShape(2); } - /// @brief Deprecated legacy shape accessor width: use shape(3) instead. - inline int width() const { return LegacyShape(3); } - inline int LegacyShape(int index) const { - CHECK_LE(num_axes(), 4) - << "Cannot use legacy accessors on Blobs with > 4 axes."; - CHECK_LT(index, 4); - CHECK_GE(index, -4); - if (index >= num_axes() || index < -num_axes()) { - // Axis is out of range, but still in [0, 3] (or [-4, -1] for reverse - // indexing) -- this special case simulates the one-padding used to fill - // extraneous axes of legacy blobs. - return 1; + /// @brief Deprecated legacy shape accessor num: use shape(0) instead. + inline int num() const { + return LegacyShape(0); + } + /// @brief Deprecated legacy shape accessor channels: use shape(1) instead. + inline int channels() const { + return LegacyShape(1); + } + /// @brief Deprecated legacy shape accessor height: use shape(2) instead. + inline int height() const { + return LegacyShape(2); + } + /// @brief Deprecated legacy shape accessor width: use shape(3) instead. + inline int width() const { + return LegacyShape(3); + } + inline int LegacyShape(int index) const { + CHECK_LE(num_axes(), 4) + << "Cannot use legacy accessors on Blobs with > 4 axes."; + CHECK_LT(index, 4); + CHECK_GE(index, -4); + if (index >= num_axes() || index < -num_axes()) { + // Axis is out of range, but still in [0, 3] (or [-4, -1] for reverse + // indexing) -- this special case simulates the one-padding used to fill + // extraneous axes of legacy blobs. 
+ return 1; + } + return shape(index); } - return shape(index); - } - inline int offset(const int n, const int c = 0, const int h = 0, - const int w = 0) const { - CHECK_GE(n, 0); - CHECK_LE(n, num()); - CHECK_GE(channels(), 0); - CHECK_LE(c, channels()); - CHECK_GE(height(), 0); - CHECK_LE(h, height()); - CHECK_GE(width(), 0); - CHECK_LE(w, width()); - return ((n * channels() + c) * height() + h) * width() + w; - } + inline int offset(const int n, const int c = 0, const int h = 0, + const int w = 0) const { + CHECK_GE(n, 0); + CHECK_LE(n, num()); + CHECK_GE(channels(), 0); + CHECK_LE(c, channels()); + CHECK_GE(height(), 0); + CHECK_LE(h, height()); + CHECK_GE(width(), 0); + CHECK_LE(w, width()); + return ((n * channels() + c) * height() + h) * width() + w; + } - inline int offset(const vector& indices) const { - CHECK_LE(indices.size(), num_axes()); - int offset = 0; - for (int i = 0; i < num_axes(); ++i) { - offset *= shape(i); - if (indices.size() > i) { - CHECK_GE(indices[i], 0); - CHECK_LT(indices[i], shape(i)); - offset += indices[i]; + inline int offset(const vector& indices) const { + CHECK_LE(indices.size(), num_axes()); + int offset = 0; + for (int i = 0; i < num_axes(); ++i) { + offset *= shape(i); + if (indices.size() > i) { + CHECK_GE(indices[i], 0); + CHECK_LT(indices[i], shape(i)); + offset += indices[i]; + } } + return offset; } - return offset; - } - /** - * @brief Copy from a source Blob. - * - * @param source the Blob to copy from - * @param copy_diff if false, copy the data; if true, copy the diff - * @param reshape if false, require this Blob to be pre-shaped to the shape - * of other (and die otherwise); if true, Reshape this Blob to other's - * shape if necessary - */ - void CopyFrom(const Blob& source, bool copy_diff = false, - bool reshape = false); + /** + * @brief Copy from a source Blob. 
+ * + * @param source the Blob to copy from + * @param copy_diff if false, copy the data; if true, copy the diff + * @param reshape if false, require this Blob to be pre-shaped to the shape + * of other (and die otherwise); if true, Reshape this Blob to other's + * shape if necessary + */ + void CopyFrom(const Blob& source, bool copy_diff = false, + bool reshape = false); - inline Dtype data_at(const int n, const int c, const int h, - const int w) const { - return cpu_data()[offset(n, c, h, w)]; - } + inline Dtype data_at(const int n, const int c, const int h, + const int w) const { + return cpu_data()[offset(n, c, h, w)]; + } - inline Dtype diff_at(const int n, const int c, const int h, - const int w) const { - return cpu_diff()[offset(n, c, h, w)]; - } + inline Dtype diff_at(const int n, const int c, const int h, + const int w) const { + return cpu_diff()[offset(n, c, h, w)]; + } - inline Dtype data_at(const vector& index) const { - return cpu_data()[offset(index)]; - } + inline Dtype data_at(const vector& index) const { + return cpu_data()[offset(index)]; + } - inline Dtype diff_at(const vector& index) const { - return cpu_diff()[offset(index)]; - } + inline Dtype diff_at(const vector& index) const { + return cpu_diff()[offset(index)]; + } - inline const shared_ptr& data() const { - CHECK(data_); - return data_; - } + inline const shared_ptr& data() const { + CHECK(data_); + return data_; + } - inline const shared_ptr& diff() const { - CHECK(diff_); - return diff_; - } + inline const shared_ptr& diff() const { + CHECK(diff_); + return diff_; + } - const Dtype* cpu_data() const; - void set_cpu_data(Dtype* data); - const Dtype* gpu_data() const; - const Dtype* cpu_diff() const; - const Dtype* gpu_diff() const; - Dtype* mutable_cpu_data(); - Dtype* mutable_gpu_data(); - Dtype* mutable_cpu_diff(); - Dtype* mutable_gpu_diff(); - void Update(); - void FromProto(const BlobProto& proto, bool reshape = true); - void ToProto(BlobProto* proto, bool write_diff = false) const; + const Dtype* cpu_data() const; + void set_cpu_data(Dtype* data); + const Dtype* gpu_data() const; + const Dtype* gpu_cache_data() const; + const Dtype* cpu_diff() const; + const Dtype* gpu_diff() const; + Dtype* mutable_cpu_data(); + Dtype* mutable_gpu_data(); + Dtype* mutable_cpu_diff(); + Dtype* mutable_gpu_diff(); + void Update(); + void FromProto(const BlobProto& proto, bool reshape = true); + void ToProto(BlobProto* proto, bool write_diff = false) const; - /// @brief Compute the sum of absolute values (L1 norm) of the data. - Dtype asum_data() const; - /// @brief Compute the sum of absolute values (L1 norm) of the diff. - Dtype asum_diff() const; - /// @brief Compute the sum of squares (L2 norm squared) of the data. - Dtype sumsq_data() const; - /// @brief Compute the sum of squares (L2 norm squared) of the diff. - Dtype sumsq_diff() const; + /// @brief Compute the sum of absolute values (L1 norm) of the data. + Dtype asum_data() const; + /// @brief Compute the sum of absolute values (L1 norm) of the diff. + Dtype asum_diff() const; + /// @brief Compute the sum of squares (L2 norm squared) of the data. + Dtype sumsq_data() const; + /// @brief Compute the sum of squares (L2 norm squared) of the diff. + Dtype sumsq_diff() const; - /// @brief Scale the blob data by a constant factor. - void scale_data(Dtype scale_factor); - /// @brief Scale the blob diff by a constant factor. - void scale_diff(Dtype scale_factor); + /// @brief Scale the blob data by a constant factor. 
+ void scale_data(Dtype scale_factor); + /// @brief Scale the blob diff by a constant factor. + void scale_diff(Dtype scale_factor); - /** - * @brief Set the data_ shared_ptr to point to the SyncedMemory holding the - * data_ of Blob other -- useful in Layer%s which simply perform a copy - * in their Forward pass. - * - * This deallocates the SyncedMemory holding this Blob's data_, as - * shared_ptr calls its destructor when reset with the "=" operator. - */ - void ShareData(const Blob& other); - /** - * @brief Set the diff_ shared_ptr to point to the SyncedMemory holding the - * diff_ of Blob other -- useful in Layer%s which simply perform a copy - * in their Forward pass. - * - * This deallocates the SyncedMemory holding this Blob's diff_, as - * shared_ptr calls its destructor when reset with the "=" operator. - */ - void ShareDiff(const Blob& other); + /** + * @brief Set the data_ shared_ptr to point to the SyncedMemory holding the + * data_ of Blob other -- useful in Layer%s which simply perform a copy + * in their Forward pass. + * + * This deallocates the SyncedMemory holding this Blob's data_, as + * shared_ptr calls its destructor when reset with the "=" operator. + */ + void ShareData(const Blob& other); + /** + * @brief Set the diff_ shared_ptr to point to the SyncedMemory holding the + * diff_ of Blob other -- useful in Layer%s which simply perform a copy + * in their Forward pass. + * + * This deallocates the SyncedMemory holding this Blob's diff_, as + * shared_ptr calls its destructor when reset with the "=" operator. + */ + void ShareDiff(const Blob& other); + void set_data_layer() { + data_->set_data_layer(); + diff_->set_data_layer(); + } - bool ShapeEquals(const BlobProto& other); + bool ShapeEquals(const BlobProto& other); - protected: - shared_ptr data_; - shared_ptr diff_; - vector shape_; - int count_; - int capacity_; + protected: + shared_ptr data_; + shared_ptr diff_; + vector shape_; + int count_; + int capacity_; - DISABLE_COPY_AND_ASSIGN(Blob); -}; // class Blob + DISABLE_COPY_AND_ASSIGN (Blob); +}; +// class Blob -} // namespace caffe +}// namespace caffe #endif // CAFFE_BLOB_HPP_ diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp index 5f86bc26..0b455c59 100644 --- a/include/caffe/common.hpp +++ b/include/caffe/common.hpp @@ -16,7 +16,17 @@ #include // pair #include +#ifndef CPU_ONLY +#include +#include +#include +#endif + +#include "caffe/device.hpp" #include "caffe/util/device_alternate.hpp" +#include "caffe/util/ocl_wrapper.hpp" +#include "caffe/util/ocl_util.hpp" +#include "caffe/util/im2col.hpp" // gflags 2.1 issue: namespace google was changed to gflags without warning. // Luckily we will be able to use GFLAGS_GFLAGS_H_ to detect if it is version @@ -64,9 +74,91 @@ private:\ // A simple macro to mark codes that are not implemented, so that when the code // is executed we will see a fatal log. 
#define NOT_IMPLEMENTED LOG(FATAL) << "Not Implemented Yet" +//OpenCL: various of defines to choose the design schemes +/* ifdef: use CPU random generator in dropout layer + ifndef: use GPU random generator*/ +//#define use_cpu_generator_dropout +//#define print_memory_trace +//the following are macro defines for optimization schmes in conv layer +/*ifdef: use proposed img_packing scheme; + ifndef: use proposed packing im2col + sgemm scheme*/ +#define use_packing_scheme 0 +/* global_packing_N defines packing number of the use_packing scheme + for intial design, we use the same packing number for all conv layers*/ +#define global_packing_N 16 +/*ifdef: use multi-command queues for groups in conv layer; + ifndef: use single commane queue for groups*/ +//#define multiQ +//#define check_gradient +// OpenCL: various checks for different function calls. +#define OCL_CHECK(condition) \ + do { \ + cl_int error = condition; \ + CHECK_EQ(error, CL_SUCCESS) << " " << error; \ + if(CL_SUCCESS != error){ \ + LOG(INFO) << "failed";\ + } \ + } while (0) + +#define CLBLAS_CHECK(flag) \ + do { \ + cl_int error = flag; \ + CHECK_EQ(error, clblasSuccess) << " " << error; \ + if (error != clblasSuccess){ \ + LOG(INFO) << "clBlas Function Failed! Error Code:" << error; \ + } \ + } while(0) + +//sample #num data from Blob_ +#define CHECK_BLOB_DATA(Blob_, num, marker) \ +do{ \ + const Dtype *top_cpu_data = Blob_->cpu_data(); \ + size_t top_cpu_data_count = Blob_->count(); \ + size_t sample_interval = top_cpu_data_count/num; \ + if(sample_interval == 0){ \ + sample_interval=1; \ + } \ + printf("%s: ", marker); \ + for(int i=0; i generator_; - }; - - // Getters for boost rng, curand, and cublas handles - inline static RNG& rng_stream() { - if (!Get().random_generator_) { - Get().random_generator_.reset(new RNG()); + enum Brew { + CPU, GPU, APU + }; + + // This random number generator facade hides boost and CUDA rng + // implementation from one another (for cross-platform compatibility). + class RNG { + public: + RNG(); + explicit RNG(unsigned int seed); + explicit RNG(const RNG&); + RNG& operator=(const RNG&); + void* generator(); + private: + class Generator; + shared_ptr generator_; + }; + + // Getters for boost rng, curand, and cublas handles + inline static RNG& rng_stream() { + if (!Get().random_generator_) { + Get().random_generator_.reset(new RNG()); + } + return *(Get().random_generator_); } - return *(Get().random_generator_); - } #ifndef CPU_ONLY - inline static cublasHandle_t cublas_handle() { return Get().cublas_handle_; } - inline static curandGenerator_t curand_generator() { - return Get().curand_generator_; - } + //inline static cublasHandle_t cublas_handle() { return Get().cublas_handle_; } + //inline static curandGenerator_t curand_generator() { + // return Get().curand_generator_; + //} #endif - // Returns the mode: running on CPU or GPU. - inline static Brew mode() { return Get().mode_; } - // The setters for the variables - // Sets the mode. It is recommended that you don't change the mode halfway - // into the program since that may cause allocation of pinned memory being - // freed in a non-pinned way, which may cause problems - I haven't verified - // it personally but better to note it here in the header file. - inline static void set_mode(Brew mode) { Get().mode_ = mode; } - // Sets the random seed of both boost and curand - static void set_random_seed(const unsigned int seed); - // Sets the device. 
Since we have cublas and curand stuff, set device also - // requires us to reset those values. - static void SetDevice(const int device_id); - // Prints the current GPU status. - static void DeviceQuery(); - - protected: + // Returns the mode: running on CPU or GPU. + inline static Brew mode() { + return Get().mode_; + } + // The setters for the variables + // Sets the mode. It is recommended that you don't change the mode halfway + // into the program since that may cause allocation of pinned memory being + // freed in a non-pinned way, which may cause problems - I haven't verified + // it personally but better to note it here in the header file. + inline static void set_mode(Brew mode) { + Get().mode_ = mode; + } + // Sets the random seed of both boost and curand + static void set_random_seed(const unsigned int seed); + // Sets the device. Since we have cublas and curand stuff, set device also + // requires us to reset those values. + static void SetDevice(const int device_id); + // Prints the current GPU status. + static void DeviceQuery(); + + protected: #ifndef CPU_ONLY - cublasHandle_t cublas_handle_; - curandGenerator_t curand_generator_; + //cublasHandle_t cublas_handle_; + //curandGenerator_t curand_generator_; #endif - shared_ptr random_generator_; + shared_ptr random_generator_; - Brew mode_; - static shared_ptr singleton_; + Brew mode_; + static shared_ptr singleton_; - private: - // The private constructor to avoid duplicate instantiation. - Caffe(); + private: + // The private constructor to avoid duplicate instantiation. + Caffe(); DISABLE_COPY_AND_ASSIGN(Caffe); }; diff --git a/include/caffe/common_layers.hpp b/include/caffe/common_layers.hpp index d2c0ce6d..ab796286 100644 --- a/include/caffe/common_layers.hpp +++ b/include/caffe/common_layers.hpp @@ -26,48 +26,56 @@ namespace caffe { * NOTE: does not implement Backwards operation. */ template -class ArgMaxLayer : public Layer { - public: - /** - * @param param provides ArgMaxParameter argmax_param, - * with ArgMaxLayer options: - * - top_k (\b optional uint, default 1). - * the number @f$ K @f$ of maximal items to output. - * - out_max_val (\b optional bool, default false). - * if set, output a vector of pairs (max_ind, max_val) for each image. - */ - explicit ArgMaxLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "ArgMax"; } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int ExactNumTopBlobs() const { return 1; } - - protected: - /** - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$ - * @param top output Blob vector (length 1) - * -# @f$ (N \times 1 \times K \times 1) @f$ or, if out_max_val - * @f$ (N \times 2 \times K \times 1) @f$ - * the computed outputs @f$ - * y_n = \arg\max\limits_i x_{ni} - * @f$ (for @f$ K = 1 @f$). - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - /// @brief Not implemented (non-differentiable function) - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - NOT_IMPLEMENTED; - } - bool out_max_val_; - size_t top_k_; +class ArgMaxLayer: public Layer { + public: + /** + * @param param provides ArgMaxParameter argmax_param, + * with ArgMaxLayer options: + * - top_k (\b optional uint, default 1). 
+ * the number @f$ K @f$ of maximal items to output. + * - out_max_val (\b optional bool, default false). + * if set, output a vector of pairs (max_ind, max_val) for each image. + */ + explicit ArgMaxLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "ArgMax"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + /** + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$ + * @param top output Blob vector (length 1) + * -# @f$ (N \times 1 \times K \times 1) @f$ or, if out_max_val + * @f$ (N \times 2 \times K \times 1) @f$ + * the computed outputs @f$ + * y_n = \arg\max\limits_i x_{ni} + * @f$ (for @f$ K = 1 @f$). + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + /// @brief Not implemented (non-differentiable function) + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + NOT_IMPLEMENTED; + } + bool out_max_val_; + size_t top_k_; }; /** @@ -75,72 +83,79 @@ class ArgMaxLayer : public Layer { * or channel dimension, outputting the result. */ template -class ConcatLayer : public Layer { - public: - explicit ConcatLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Concat"; } - virtual inline int MinBottomBlobs() const { return 2; } - virtual inline int ExactNumTopBlobs() const { return 1; } - - protected: - /** - * @param bottom input Blob vector (length 2+) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x_1 @f$ - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x_2 @f$ - * -# ... - * - K @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x_K @f$ - * @param top output Blob vector (length 1) - * -# @f$ (KN \times C \times H \times W) @f$ if axis == 0, or - * @f$ (N \times KC \times H \times W) @f$ if axis == 1: - * the concatenated output @f$ - * y = [\begin{array}{cccc} x_1 & x_2 & ... & x_K \end{array}] - * @f$ - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the concatenate inputs. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (KN \times C \times H \times W) @f$ if axis == 0, or - * @f$ (N \times KC \times H \times W) @f$ if axis == 1: - * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ - * with respect to concatenated outputs @f$ y @f$ - * @param propagate_down see Layer::Backward. - * @param bottom input Blob vector (length K), into which the top gradient - * @f$ \frac{\partial E}{\partial y} @f$ is deconcatenated back to the - * inputs @f$ - * \left[ \begin{array}{cccc} - * \frac{\partial E}{\partial x_1} & - * \frac{\partial E}{\partial x_2} & - * ... 
& - * \frac{\partial E}{\partial x_K} - * \end{array} \right] = - * \frac{\partial E}{\partial y} - * @f$ - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - int count_; - int num_concats_; - int concat_input_size_; - int concat_axis_; +class ConcatLayer: public Layer { + public: + explicit ConcatLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Concat"; + } + virtual inline int MinBottomBlobs() const { + return 2; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + /** + * @param bottom input Blob vector (length 2+) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x_1 @f$ + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x_2 @f$ + * -# ... + * - K @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x_K @f$ + * @param top output Blob vector (length 1) + * -# @f$ (KN \times C \times H \times W) @f$ if axis == 0, or + * @f$ (N \times KC \times H \times W) @f$ if axis == 1: + * the concatenated output @f$ + * y = [\begin{array}{cccc} x_1 & x_2 & ... & x_K \end{array}] + * @f$ + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the concatenate inputs. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (KN \times C \times H \times W) @f$ if axis == 0, or + * @f$ (N \times KC \times H \times W) @f$ if axis == 1: + * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ + * with respect to concatenated outputs @f$ y @f$ + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length K), into which the top gradient + * @f$ \frac{\partial E}{\partial y} @f$ is deconcatenated back to the + * inputs @f$ + * \left[ \begin{array}{cccc} + * \frac{\partial E}{\partial x_1} & + * \frac{\partial E}{\partial x_2} & + * ... & + * \frac{\partial E}{\partial x_K} + * \end{array} \right] = + * \frac{\partial E}{\partial y} + * @f$ + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + int count_; + int num_concats_; + int concat_input_size_; + int concat_axis_; }; /** @@ -150,34 +165,41 @@ class ConcatLayer : public Layer { * TODO(dox): thorough documentation for Forward, Backward, and proto params. 
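For reference, the SUM mode of this layer scales each bottom blob by its coefficient (the coeffs_ member declared below) and accumulates the results element by element. A minimal CPU sketch of that accumulation for two inputs of equal count, illustrative only; the real layer goes through Caffe's BLAS wrappers such as caffe_axpy:

    // illustrative helper, not part of the patch
    void eltwise_sum(const float* x1, const float* x2,
                     float c1, float c2, int count, float* y) {
      for (int i = 0; i < count; ++i) {
        y[i] = c1 * x1[i] + c2 * x2[i];  // y = c1*x1 + c2*x2, elementwise
      }
    }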
*/ template -class EltwiseLayer : public Layer { - public: - explicit EltwiseLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Eltwise"; } - virtual inline int MinBottomBlobs() const { return 2; } - virtual inline int ExactNumTopBlobs() const { return 1; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - EltwiseParameter_EltwiseOp op_; - vector coeffs_; - Blob max_idx_; - - bool stable_prod_grad_; +class EltwiseLayer: public Layer { + public: + explicit EltwiseLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Eltwise"; + } + virtual inline int MinBottomBlobs() const { + return 2; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + EltwiseParameter_EltwiseOp op_; + vector coeffs_; + Blob max_idx_; + + bool stable_prod_grad_; }; /** @@ -187,60 +209,67 @@ class EltwiseLayer : public Layer { * item needs to stay). */ template -class FilterLayer : public Layer { - public: - explicit FilterLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Filter"; } - virtual inline int MinBottomBlobs() const { return 2; } - virtual inline int MinTopBlobs() const { return 1; } - - protected: - /** - * @param bottom input Blob vector (length 2+) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs to be filtered @f$ x_1 @f$ - * -# ... - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs to be filtered @f$ x_K @f$ - * -# @f$ (N \times 1 \times 1 \times 1) @f$ - * the selector blob - * @param top output Blob vector (length 1+) - * -# @f$ (S \times C \times H \times W) @f$ () - * the filtered output @f$ x_1 @f$ - * where S is the number of items - * that haven't been filtered - * @f$ (S \times C \times H \times W) @f$ - * the filtered output @f$ x_K @f$ - * where S is the number of items - * that haven't been filtered - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the forwarded inputs. - * - * @param top output Blob vector (length 1+), providing the error gradient with - * respect to the outputs - * @param propagate_down see Layer::Backward. 
- * @param bottom input Blob vector (length 2+), into which the top error - * gradient is copied - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - bool first_reshape_; - vector indices_to_forward_; +class FilterLayer: public Layer { + public: + explicit FilterLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Filter"; + } + virtual inline int MinBottomBlobs() const { + return 2; + } + virtual inline int MinTopBlobs() const { + return 1; + } + + protected: + /** + * @param bottom input Blob vector (length 2+) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs to be filtered @f$ x_1 @f$ + * -# ... + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs to be filtered @f$ x_K @f$ + * -# @f$ (N \times 1 \times 1 \times 1) @f$ + * the selector blob + * @param top output Blob vector (length 1+) + * -# @f$ (S \times C \times H \times W) @f$ () + * the filtered output @f$ x_1 @f$ + * where S is the number of items + * that haven't been filtered + * @f$ (S \times C \times H \times W) @f$ + * the filtered output @f$ x_K @f$ + * where S is the number of items + * that haven't been filtered + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the forwarded inputs. + * + * @param top output Blob vector (length 1+), providing the error gradient with + * respect to the outputs + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length 2+), into which the top error + * gradient is copied + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + bool first_reshape_; + vector indices_to_forward_; }; /** @@ -254,40 +283,47 @@ class FilterLayer : public Layer { * (see Blob::ShareDiff). */ template -class FlattenLayer : public Layer { - public: - explicit FlattenLayer(const LayerParameter& param) - : Layer(param) {} - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Flatten"; } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int ExactNumTopBlobs() const { return 1; } - - protected: - /** - * @param bottom input Blob vector (length 2+) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs - * @param top output Blob vector (length 1) - * -# @f$ (N \times CHW \times 1 \times 1) @f$ - * the outputs -- i.e., the (virtually) copied, flattened inputs - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the concatenate inputs. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * @param propagate_down see Layer::Backward. 
- * @param bottom input Blob vector (length K), into which the top error - * gradient is (virtually) copied - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); +class FlattenLayer: public Layer { + public: + explicit FlattenLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Flatten"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + /** + * @param bottom input Blob vector (length 2+) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs + * @param top output Blob vector (length 1) + * -# @f$ (N \times CHW \times 1 \times 1) @f$ + * the outputs -- i.e., the (virtually) copied, flattened inputs + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the concatenate inputs. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length K), into which the top error + * gradient is (virtually) copied + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); }; /** @@ -297,34 +333,41 @@ class FlattenLayer : public Layer { * TODO(dox): thorough documentation for Forward, Backward, and proto params. */ template -class InnerProductLayer : public Layer { - public: - explicit InnerProductLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "InnerProduct"; } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int ExactNumTopBlobs() const { return 1; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - int M_; - int K_; - int N_; - bool bias_term_; - Blob bias_multiplier_; +class InnerProductLayer: public Layer { + public: + explicit InnerProductLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "InnerProduct"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + int M_; + int K_; + int N_; + bool bias_term_; + Blob bias_multiplier_; }; /** @@ -333,32 +376,39 @@ class InnerProductLayer : public Layer { * TODO(dox): thorough documentation 
for Forward, Backward, and proto params. */ template -class MVNLayer : public Layer { - public: - explicit MVNLayer(const LayerParameter& param) - : Layer(param) {} - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "MVN"; } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int ExactNumTopBlobs() const { return 1; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - Blob mean_, variance_, temp_; - - /// sum_multiplier is used to carry out sum using BLAS - Blob sum_multiplier_; - Dtype eps_; +class MVNLayer: public Layer { + public: + explicit MVNLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "MVN"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + Blob mean_, variance_, temp_; + + /// sum_multiplier is used to carry out sum using BLAS + Blob sum_multiplier_; + Dtype eps_; }; /* @@ -368,35 +418,48 @@ class MVNLayer : public Layer { * (see FlattenLayer, Blob::ShareData and Blob::ShareDiff). 
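In the layer's reshape_param, a dimension of 0 copies the corresponding bottom dimension and -1 asks the layer to infer that dimension from the remaining element count, which is what the copy_axes_ and inferred_axis_ members below track. A hedged sketch of how those conventions resolve a requested shape (resolve_shape is an illustrative helper, not part of the patch):

    #include <vector>
    // spec: requested dims, where 0 = copy from bottom and -1 = infer;
    // count: total number of elements in the bottom blob
    std::vector<int> resolve_shape(const std::vector<int>& bottom_shape,
                                   std::vector<int> spec, int count) {
      int inferred = -1, known = 1;
      for (size_t i = 0; i < spec.size(); ++i) {
        if (spec[i] == 0) spec[i] = bottom_shape[i];      // copy this axis
        if (spec[i] == -1) inferred = static_cast<int>(i);
        else known *= spec[i];
      }
      if (inferred >= 0) spec[inferred] = count / known;  // infer the rest
      return spec;
    }

For example, a 2x3x4 bottom reshaped with spec {0, -1, 4} keeps the first axis and infers 3 for the second, while spec {0, -1} would give 2x12.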
*/ template -class ReshapeLayer : public Layer { - public: - explicit ReshapeLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Reshape"; } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int ExactNumTopBlobs() const { return 1; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top) {} - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) {} - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top) {} - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) {} - - /// @brief vector of axes indices whose dimensions we'll copy from the bottom - vector copy_axes_; - /// @brief the index of the axis whose dimension we infer, or -1 if none - int inferred_axis_; - /// @brief the product of the "constant" output dimensions - int constant_count_; +class ReshapeLayer: public Layer { + public: + explicit ReshapeLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Reshape"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top) { + } + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + } + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top) { + } + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + } + + /// @brief vector of axes indices whose dimensions we'll copy from the bottom + vector copy_axes_; + /// @brief the index of the axis whose dimension we infer, or -1 if none + int inferred_axis_; + /// @brief the product of the "constant" output dimensions + int constant_count_; }; /** @@ -407,41 +470,48 @@ class ReshapeLayer : public Layer { * TODO(dox): thorough documentation for Forward, Backward, and proto params. 
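This layer collapses everything from axis_ onward, so the num_ and dim_ members documented below end up as the number of independent reductions and the length of each one, with coeff_ applied to every output. A small illustrative sketch of the SUM case (not the layer's actual code, which uses the sum_multiplier_ blob and BLAS):

    // y receives `num` outputs; each reduces `dim` consecutive inputs
    void reduce_sum(const float* x, int num, int dim, float coeff, float* y) {
      for (int n = 0; n < num; ++n) {
        float acc = 0.0f;
        for (int d = 0; d < dim; ++d) {
          acc += x[n * dim + d];
        }
        y[n] = coeff * acc;  // one scaled sum per reduction
      }
    }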
*/ template -class ReductionLayer : public Layer { - public: - explicit ReductionLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Reduction"; } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int ExactNumTopBlobs() const { return 1; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - /// @brief the reduction operation performed by the layer - ReductionParameter_ReductionOp op_; - /// @brief a scalar coefficient applied to all outputs - Dtype coeff_; - /// @brief the index of the first input axis to reduce - int axis_; - /// @brief the number of reductions performed - int num_; - /// @brief the input size of each reduction - int dim_; - /// @brief a helper Blob used for summation (op_ == SUM) - Blob sum_multiplier_; +class ReductionLayer: public Layer { + public: + explicit ReductionLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Reduction"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + /// @brief the reduction operation performed by the layer + ReductionParameter_ReductionOp op_; + /// @brief a scalar coefficient applied to all outputs + Dtype coeff_; + /// @brief the index of the first input axis to reduce + int axis_; + /// @brief the number of reductions performed + int num_; + /// @brief the input size of each reduction + int dim_; + /// @brief a helper Blob used for summation (op_ == SUM) + Blob sum_multiplier_; }; /** @@ -449,28 +519,37 @@ class ReductionLayer : public Layer { * to suppress outputs during testing.) */ template -class SilenceLayer : public Layer { - public: - explicit SilenceLayer(const LayerParameter& param) - : Layer(param) {} - virtual void Reshape(const vector*>& bottom, - const vector*>& top) {} - - virtual inline const char* type() const { return "Silence"; } - virtual inline int MinBottomBlobs() const { return 1; } - virtual inline int ExactNumTopBlobs() const { return 0; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top) {} - // We can't define Forward_gpu here, since STUB_GPU will provide - // its own definition for CPU_ONLY mode. 
- virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); +class SilenceLayer: public Layer { + public: + explicit SilenceLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void Reshape(const vector*>& bottom, + const vector*>& top) { + } + + virtual inline const char* type() const { + return "Silence"; + } + virtual inline int MinBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 0; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top) { + } + // We can't define Forward_gpu here, since STUB_GPU will provide + // its own definition for CPU_ONLY mode. + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); }; /** @@ -479,34 +558,42 @@ class SilenceLayer : public Layer { * TODO(dox): thorough documentation for Forward, Backward, and proto params. */ template -class SoftmaxLayer : public Layer { - public: - explicit SoftmaxLayer(const LayerParameter& param) - : Layer(param) {} - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Softmax"; } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int ExactNumTopBlobs() const { return 1; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - int outer_num_; - int inner_num_; - int softmax_axis_; - /// sum_multiplier is used to carry out sum using BLAS - Blob sum_multiplier_; - /// scale is an intermediate Blob to hold temporary results. - Blob scale_; +class SoftmaxLayer: public Layer { + public: + explicit SoftmaxLayer(const LayerParameter& param) + : Layer(param) { + } + ~SoftmaxLayer(); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Softmax"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + int outer_num_; + int inner_num_; + int softmax_axis_; + /// sum_multiplier is used to carry out sum using BLAS + Blob sum_multiplier_; + /// scale is an intermediate Blob to hold temporary results. 
+ Blob scale_; }; #ifdef USE_CUDNN @@ -516,23 +603,23 @@ class SoftmaxLayer : public Layer { */ template class CuDNNSoftmaxLayer : public SoftmaxLayer { - public: + public: explicit CuDNNSoftmaxLayer(const LayerParameter& param) - : SoftmaxLayer(param), handles_setup_(false) {} + : SoftmaxLayer(param), handles_setup_(false) {} virtual void LayerSetUp(const vector*>& bottom, const vector*>& top); virtual void Reshape(const vector*>& bottom, const vector*>& top); virtual ~CuDNNSoftmaxLayer(); - protected: + protected: virtual void Forward_gpu(const vector*>& bottom, const vector*>& top); virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); bool handles_setup_; - cudnnHandle_t handle_; + cudnnHandle_t handle_; cudnnTensorDescriptor_t bottom_desc_; cudnnTensorDescriptor_t top_desc_; }; @@ -545,28 +632,36 @@ class CuDNNSoftmaxLayer : public SoftmaxLayer { * TODO(dox): thorough documentation for Forward, Backward, and proto params. */ template -class SplitLayer : public Layer { - public: - explicit SplitLayer(const LayerParameter& param) - : Layer(param) {} - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Split"; } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int MinTopBlobs() const { return 1; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - int count_; +class SplitLayer: public Layer { + public: + explicit SplitLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Split"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int MinTopBlobs() const { + return 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + int count_; + cl_kernel gpu_add_kernel; }; /** @@ -576,34 +671,41 @@ class SplitLayer : public Layer { * TODO(dox): thorough documentation for Forward, Backward, and proto params. 
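The slice_point_ values declared below are the offsets at which the bottom blob is cut along slice_axis_; with no slice points the axis is split evenly across the top blobs. An illustrative helper (not part of the patch) that turns slice points into per-top extents:

    #include <vector>
    // e.g. an axis of length 10 with slice points {3, 7} yields extents 3, 4, 3
    std::vector<int> slice_extents(int axis_dim, const std::vector<int>& points) {
      std::vector<int> extents;
      int prev = 0;
      for (size_t i = 0; i < points.size(); ++i) {
        extents.push_back(points[i] - prev);
        prev = points[i];
      }
      extents.push_back(axis_dim - prev);  // the last top gets the remainder
      return extents;
    }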
*/ template -class SliceLayer : public Layer { - public: - explicit SliceLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Slice"; } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int MinTopBlobs() const { return 2; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - int count_; - int num_slices_; - int slice_size_; - int slice_axis_; - vector slice_point_; +class SliceLayer: public Layer { + public: + explicit SliceLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Slice"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int MinTopBlobs() const { + return 2; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + int count_; + int num_slices_; + int slice_size_; + int slice_axis_; + vector slice_point_; }; } // namespace caffe diff --git a/include/caffe/data_layers.hpp b/include/caffe/data_layers.hpp index 3958cb7e..d4f526b3 100644 --- a/include/caffe/data_layers.hpp +++ b/include/caffe/data_layers.hpp @@ -25,78 +25,95 @@ namespace caffe { * TODO(dox): thorough documentation for Forward and proto params. */ template -class BaseDataLayer : public Layer { - public: - explicit BaseDataLayer(const LayerParameter& param); - // LayerSetUp: implements common data layer setup functionality, and calls - // DataLayerSetUp to do special data layer setup for individual layer types. - // This method may not be overridden except by the BasePrefetchingDataLayer. - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void DataLayerSetUp(const vector*>& bottom, - const vector*>& top) {} - // Data layers have no bottoms, so reshaping is trivial. - virtual void Reshape(const vector*>& bottom, - const vector*>& top) {} - - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) {} - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) {} - - protected: - TransformationParameter transform_param_; - shared_ptr > data_transformer_; - bool output_labels_; +class BaseDataLayer: public Layer { + public: + explicit BaseDataLayer(const LayerParameter& param); + // LayerSetUp: implements common data layer setup functionality, and calls + // DataLayerSetUp to do special data layer setup for individual layer types. + // This method may not be overridden except by the BasePrefetchingDataLayer. 
+ virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void DataLayerSetUp(const vector*>& bottom, + const vector*>& top) { + } + // Data layers have no bottoms, so reshaping is trivial. + virtual void Reshape(const vector*>& bottom, + const vector*>& top) { + } + + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + } + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + } + + protected: + TransformationParameter transform_param_; + shared_ptr > data_transformer_; + bool output_labels_; }; template -class BasePrefetchingDataLayer : - public BaseDataLayer, public InternalThread { - public: - explicit BasePrefetchingDataLayer(const LayerParameter& param) - : BaseDataLayer(param) {} - // LayerSetUp: implements common data layer setup functionality, and calls - // DataLayerSetUp to do special data layer setup for individual layer types. - // This method may not be overridden. - void LayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - virtual void CreatePrefetchThread(); - virtual void JoinPrefetchThread(); - // The thread's function - virtual void InternalThreadEntry() {} - - protected: - Blob prefetch_data_; - Blob prefetch_label_; - Blob transformed_data_; +class BasePrefetchingDataLayer: public BaseDataLayer, + public InternalThread { + public: + explicit BasePrefetchingDataLayer(const LayerParameter& param) + : BaseDataLayer(param) { + } + // LayerSetUp: implements common data layer setup functionality, and calls + // DataLayerSetUp to do special data layer setup for individual layer types. + // This method may not be overridden. 
+ void LayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + virtual void CreatePrefetchThread(); + virtual void JoinPrefetchThread(); + // The thread's function + virtual void InternalThreadEntry() { + } + + protected: + Blob prefetch_data_; + Blob prefetch_label_; + Blob transformed_data_; }; template -class DataLayer : public BasePrefetchingDataLayer { - public: - explicit DataLayer(const LayerParameter& param) - : BasePrefetchingDataLayer(param) {} - virtual ~DataLayer(); - virtual void DataLayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Data"; } - virtual inline int ExactNumBottomBlobs() const { return 0; } - virtual inline int MinTopBlobs() const { return 1; } - virtual inline int MaxTopBlobs() const { return 2; } - - protected: - virtual void InternalThreadEntry(); - - shared_ptr db_; - shared_ptr cursor_; +class DataLayer: public BasePrefetchingDataLayer { + public: + explicit DataLayer(const LayerParameter& param) + : BasePrefetchingDataLayer(param) { + } + virtual ~DataLayer(); + virtual void DataLayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Data"; + } + virtual inline int ExactNumBottomBlobs() const { + return 0; + } + virtual inline int MinTopBlobs() const { + return 1; + } + virtual inline int MaxTopBlobs() const { + return 2; + } + + protected: + virtual void InternalThreadEntry(); + + shared_ptr db_; + shared_ptr cursor_; }; /** @@ -105,30 +122,42 @@ class DataLayer : public BasePrefetchingDataLayer { * TODO(dox): thorough documentation for Forward and proto params. */ template -class DummyDataLayer : public Layer { - public: - explicit DummyDataLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - // Data layers have no bottoms, so reshaping is trivial. - virtual void Reshape(const vector*>& bottom, - const vector*>& top) {} - - virtual inline const char* type() const { return "DummyData"; } - virtual inline int ExactNumBottomBlobs() const { return 0; } - virtual inline int MinTopBlobs() const { return 1; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) {} - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) {} - - vector > > fillers_; - vector refill_; +class DummyDataLayer: public Layer { + public: + explicit DummyDataLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + // Data layers have no bottoms, so reshaping is trivial. 
+ virtual void Reshape(const vector*>& bottom, + const vector*>& top) { + } + + virtual inline const char* type() const { + return "DummyData"; + } + virtual inline int ExactNumBottomBlobs() const { + return 0; + } + virtual inline int MinTopBlobs() const { + return 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + } + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + } + + vector > > fillers_; + vector refill_; }; /** @@ -137,39 +166,51 @@ class DummyDataLayer : public Layer { * TODO(dox): thorough documentation for Forward and proto params. */ template -class HDF5DataLayer : public Layer { - public: - explicit HDF5DataLayer(const LayerParameter& param) - : Layer(param) {} - virtual ~HDF5DataLayer(); - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - // Data layers have no bottoms, so reshaping is trivial. - virtual void Reshape(const vector*>& bottom, - const vector*>& top) {} - - virtual inline const char* type() const { return "HDF5Data"; } - virtual inline int ExactNumBottomBlobs() const { return 0; } - virtual inline int MinTopBlobs() const { return 1; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) {} - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) {} - virtual void LoadHDF5FileData(const char* filename); - - std::vector hdf_filenames_; - unsigned int num_files_; - unsigned int current_file_; - hsize_t current_row_; - std::vector > > hdf_blobs_; - std::vector data_permutation_; - std::vector file_permutation_; +class HDF5DataLayer: public Layer { + public: + explicit HDF5DataLayer(const LayerParameter& param) + : Layer(param) { + } + virtual ~HDF5DataLayer(); + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + // Data layers have no bottoms, so reshaping is trivial. + virtual void Reshape(const vector*>& bottom, + const vector*>& top) { + } + + virtual inline const char* type() const { + return "HDF5Data"; + } + virtual inline int ExactNumBottomBlobs() const { + return 0; + } + virtual inline int MinTopBlobs() const { + return 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + } + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + } + virtual void LoadHDF5FileData(const char* filename); + + std::vector hdf_filenames_; + unsigned int num_files_; + unsigned int current_file_; + hsize_t current_row_; + std::vector > > hdf_blobs_; + std::vector data_permutation_; + std::vector file_permutation_; }; /** @@ -178,40 +219,50 @@ class HDF5DataLayer : public Layer { * TODO(dox): thorough documentation for Forward and proto params. 
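This layer writes its two bottom blobs (data and label) to an HDF5 file; Caffe conventionally stores them under the dataset names "data" and "label". A hedged readback sketch using the HDF5 lite API (filename and count are caller-supplied; this is not part of the patch):

    #include <vector>
    #include "hdf5.h"
    #include "hdf5_hl.h"
    // count must equal the total number of floats written to the "data" set
    void read_back(const char* filename, int count) {
      hid_t file = H5Fopen(filename, H5F_ACC_RDONLY, H5P_DEFAULT);
      std::vector<float> data(count);
      H5LTread_dataset_float(file, "data", &data[0]);  // flattened blob contents
      H5Fclose(file);
    }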
*/ template -class HDF5OutputLayer : public Layer { - public: - explicit HDF5OutputLayer(const LayerParameter& param) - : Layer(param), file_opened_(false) {} - virtual ~HDF5OutputLayer(); - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - // Data layers have no bottoms, so reshaping is trivial. - virtual void Reshape(const vector*>& bottom, - const vector*>& top) {} - - virtual inline const char* type() const { return "HDF5Output"; } - // TODO: no limit on the number of blobs - virtual inline int ExactNumBottomBlobs() const { return 2; } - virtual inline int ExactNumTopBlobs() const { return 0; } - - inline std::string file_name() const { return file_name_; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void SaveBlobs(); - - bool file_opened_; - std::string file_name_; - hid_t file_id_; - Blob data_blob_; - Blob label_blob_; +class HDF5OutputLayer: public Layer { + public: + explicit HDF5OutputLayer(const LayerParameter& param) + : Layer(param), file_opened_(false) { + } + virtual ~HDF5OutputLayer(); + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + // Data layers have no bottoms, so reshaping is trivial. + virtual void Reshape(const vector*>& bottom, + const vector*>& top) { + } + + virtual inline const char* type() const { + return "HDF5Output"; + } + // TODO: no limit on the number of blobs + virtual inline int ExactNumBottomBlobs() const { + return 2; + } + virtual inline int ExactNumTopBlobs() const { + return 0; + } + + inline std::string file_name() const { + return file_name_; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void SaveBlobs(); + + bool file_opened_; + std::string file_name_; + hid_t file_id_; + Blob data_blob_; + Blob label_blob_; }; /** @@ -220,25 +271,32 @@ class HDF5OutputLayer : public Layer { * TODO(dox): thorough documentation for Forward and proto params. 
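This layer is driven by a plain-text source file listing one image path and integer label per line; those pairs end up in the lines_ member declared below. A sketch of that parsing, similar in spirit to what DataLayerSetUp does (read_list is an illustrative helper, not part of the patch):

    #include <fstream>
    #include <string>
    #include <utility>
    #include <vector>
    std::vector<std::pair<std::string, int> > read_list(const char* source) {
      std::vector<std::pair<std::string, int> > lines;
      std::ifstream infile(source);
      std::string filename;
      int label;
      while (infile >> filename >> label) {           // e.g. "cat.jpg 7"
        lines.push_back(std::make_pair(filename, label));
      }
      return lines;
    }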
*/ template -class ImageDataLayer : public BasePrefetchingDataLayer { - public: - explicit ImageDataLayer(const LayerParameter& param) - : BasePrefetchingDataLayer(param) {} - virtual ~ImageDataLayer(); - virtual void DataLayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "ImageData"; } - virtual inline int ExactNumBottomBlobs() const { return 0; } - virtual inline int ExactNumTopBlobs() const { return 2; } - - protected: - shared_ptr prefetch_rng_; - virtual void ShuffleImages(); - virtual void InternalThreadEntry(); - - vector > lines_; - int lines_id_; +class ImageDataLayer: public BasePrefetchingDataLayer { + public: + explicit ImageDataLayer(const LayerParameter& param) + : BasePrefetchingDataLayer(param) { + } + virtual ~ImageDataLayer(); + virtual void DataLayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "ImageData"; + } + virtual inline int ExactNumBottomBlobs() const { + return 0; + } + virtual inline int ExactNumTopBlobs() const { + return 2; + } + + protected: + shared_ptr prefetch_rng_; + virtual void ShuffleImages(); + virtual void InternalThreadEntry(); + + vector > lines_; + int lines_id_; }; /** @@ -247,43 +305,58 @@ class ImageDataLayer : public BasePrefetchingDataLayer { * TODO(dox): thorough documentation for Forward and proto params. */ template -class MemoryDataLayer : public BaseDataLayer { - public: - explicit MemoryDataLayer(const LayerParameter& param) - : BaseDataLayer(param), has_new_data_(false) {} - virtual void DataLayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "MemoryData"; } - virtual inline int ExactNumBottomBlobs() const { return 0; } - virtual inline int ExactNumTopBlobs() const { return 2; } - - virtual void AddDatumVector(const vector& datum_vector); - virtual void AddMatVector(const vector& mat_vector, - const vector& labels); - - // Reset should accept const pointers, but can't, because the memory - // will be given to Blob, which is mutable - void Reset(Dtype* data, Dtype* label, int n); - void set_batch_size(int new_size); - - int batch_size() { return batch_size_; } - int channels() { return channels_; } - int height() { return height_; } - int width() { return width_; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - - int batch_size_, channels_, height_, width_, size_; - Dtype* data_; - Dtype* labels_; - int n_; - size_t pos_; - Blob added_data_; - Blob added_label_; - bool has_new_data_; +class MemoryDataLayer: public BaseDataLayer { + public: + explicit MemoryDataLayer(const LayerParameter& param) + : BaseDataLayer(param), has_new_data_(false) { + } + virtual void DataLayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "MemoryData"; + } + virtual inline int ExactNumBottomBlobs() const { + return 0; + } + virtual inline int ExactNumTopBlobs() const { + return 2; + } + + virtual void AddDatumVector(const vector& datum_vector); + virtual void AddMatVector(const vector& mat_vector, + const vector& labels); + + // Reset should accept const pointers, but can't, because the memory + // will be given to Blob, which is mutable + void Reset(Dtype* data, Dtype* label, int n); + void set_batch_size(int new_size); + + int batch_size() { + return batch_size_; + } + int channels() { + return channels_; + } + int height() { + return height_; + } + int width() { + 
return width_; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + + int batch_size_, channels_, height_, width_, size_; + Dtype* data_; + Dtype* labels_; + int n_; + size_t pos_; + Blob added_data_; + Blob added_label_; + bool has_new_data_; }; /** @@ -293,33 +366,42 @@ class MemoryDataLayer : public BaseDataLayer { * TODO(dox): thorough documentation for Forward and proto params. */ template -class WindowDataLayer : public BasePrefetchingDataLayer { - public: - explicit WindowDataLayer(const LayerParameter& param) - : BasePrefetchingDataLayer(param) {} - virtual ~WindowDataLayer(); - virtual void DataLayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "WindowData"; } - virtual inline int ExactNumBottomBlobs() const { return 0; } - virtual inline int ExactNumTopBlobs() const { return 2; } - - protected: - virtual unsigned int PrefetchRand(); - virtual void InternalThreadEntry(); - - shared_ptr prefetch_rng_; - vector > > image_database_; - enum WindowField { IMAGE_INDEX, LABEL, OVERLAP, X1, Y1, X2, Y2, NUM }; - vector > fg_windows_; - vector > bg_windows_; - Blob data_mean_; - vector mean_values_; - bool has_mean_file_; - bool has_mean_values_; - bool cache_images_; - vector > image_database_cache_; +class WindowDataLayer: public BasePrefetchingDataLayer { + public: + explicit WindowDataLayer(const LayerParameter& param) + : BasePrefetchingDataLayer(param) { + } + virtual ~WindowDataLayer(); + virtual void DataLayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "WindowData"; + } + virtual inline int ExactNumBottomBlobs() const { + return 0; + } + virtual inline int ExactNumTopBlobs() const { + return 2; + } + + protected: + virtual unsigned int PrefetchRand(); + virtual void InternalThreadEntry(); + + shared_ptr prefetch_rng_; + vector > > image_database_; + enum WindowField { + IMAGE_INDEX, LABEL, OVERLAP, X1, Y1, X2, Y2, NUM + }; + vector > fg_windows_; + vector > bg_windows_; + Blob data_mean_; + vector mean_values_; + bool has_mean_file_; + bool has_mean_values_; + bool cache_images_; + vector > image_database_cache_; }; } // namespace caffe diff --git a/include/caffe/data_transformer.hpp b/include/caffe/data_transformer.hpp index 0ad68c80..daa4eee0 100644 --- a/include/caffe/data_transformer.hpp +++ b/include/caffe/data_transformer.hpp @@ -15,134 +15,134 @@ namespace caffe { */ template class DataTransformer { - public: - explicit DataTransformer(const TransformationParameter& param, Phase phase); - virtual ~DataTransformer() {} - - /** - * @brief Initialize the Random number generations if needed by the - * transformation. - */ - void InitRand(); - - /** - * @brief Applies the transformation defined in the data layer's - * transform_param block to the data. - * - * @param datum - * Datum containing the data to be transformed. - * @param transformed_blob - * This is destination blob. It can be part of top blob's data if - * set_cpu_data() is used. See data_layer.cpp for an example. - */ - void Transform(const Datum& datum, Blob* transformed_blob); - - /** - * @brief Applies the transformation defined in the data layer's - * transform_param block to a vector of Datum. - * - * @param datum_vector - * A vector of Datum containing the data to be transformed. - * @param transformed_blob - * This is destination blob. It can be part of top blob's data if - * set_cpu_data() is used. See memory_layer.cpp for an example. 
- */ - void Transform(const vector & datum_vector, - Blob* transformed_blob); - - /** - * @brief Applies the transformation defined in the data layer's - * transform_param block to a vector of Mat. - * - * @param mat_vector - * A vector of Mat containing the data to be transformed. - * @param transformed_blob - * This is destination blob. It can be part of top blob's data if - * set_cpu_data() is used. See memory_layer.cpp for an example. - */ - void Transform(const vector & mat_vector, - Blob* transformed_blob); - - /** - * @brief Applies the transformation defined in the data layer's - * transform_param block to a cv::Mat - * - * @param cv_img - * cv::Mat containing the data to be transformed. - * @param transformed_blob - * This is destination blob. It can be part of top blob's data if - * set_cpu_data() is used. See image_data_layer.cpp for an example. - */ - void Transform(const cv::Mat& cv_img, Blob* transformed_blob); - - /** - * @brief Applies the same transformation defined in the data layer's - * transform_param block to all the num images in a input_blob. - * - * @param input_blob - * A Blob containing the data to be transformed. It applies the same - * transformation to all the num images in the blob. - * @param transformed_blob - * This is destination blob, it will contain as many images as the - * input blob. It can be part of top blob's data. - */ - void Transform(Blob* input_blob, Blob* transformed_blob); - - /** - * @brief Infers the shape of transformed_blob will have when - * the transformation is applied to the data. - * - * @param datum - * Datum containing the data to be transformed. - */ - vector InferBlobShape(const Datum& datum); - /** - * @brief Infers the shape of transformed_blob will have when - * the transformation is applied to the data. - * It uses the first element to infer the shape of the blob. - * - * @param datum_vector - * A vector of Datum containing the data to be transformed. - */ - vector InferBlobShape(const vector & datum_vector); - /** - * @brief Infers the shape of transformed_blob will have when - * the transformation is applied to the data. - * It uses the first element to infer the shape of the blob. - * - * @param mat_vector - * A vector of Mat containing the data to be transformed. - */ - vector InferBlobShape(const vector & mat_vector); - /** - * @brief Infers the shape of transformed_blob will have when - * the transformation is applied to the data. - * - * @param cv_img - * cv::Mat containing the data to be transformed. - */ - vector InferBlobShape(const cv::Mat& cv_img); - - protected: - /** - * @brief Generates a random integer from Uniform({0, 1, ..., n-1}). - * - * @param n - * The upperbound (exclusive) value of the random number. - * @return - * A uniformly random integer value from ({0, 1, ..., n-1}). - */ - virtual int Rand(int n); - - void Transform(const Datum& datum, Dtype* transformed_data); - // Tranformation parameters - TransformationParameter param_; - - - shared_ptr rng_; - Phase phase_; - Blob data_mean_; - vector mean_values_; + public: + explicit DataTransformer(const TransformationParameter& param, Phase phase); + virtual ~DataTransformer() { + } + + /** + * @brief Initialize the Random number generations if needed by the + * transformation. + */ + void InitRand(); + + /** + * @brief Applies the transformation defined in the data layer's + * transform_param block to the data. + * + * @param datum + * Datum containing the data to be transformed. + * @param transformed_blob + * This is destination blob. 
It can be part of top blob's data if + * set_cpu_data() is used. See data_layer.cpp for an example. + */ + void Transform(const Datum& datum, Blob* transformed_blob); + + /** + * @brief Applies the transformation defined in the data layer's + * transform_param block to a vector of Datum. + * + * @param datum_vector + * A vector of Datum containing the data to be transformed. + * @param transformed_blob + * This is destination blob. It can be part of top blob's data if + * set_cpu_data() is used. See memory_layer.cpp for an example. + */ + void Transform(const vector & datum_vector, + Blob* transformed_blob); + + /** + * @brief Applies the transformation defined in the data layer's + * transform_param block to a vector of Mat. + * + * @param mat_vector + * A vector of Mat containing the data to be transformed. + * @param transformed_blob + * This is destination blob. It can be part of top blob's data if + * set_cpu_data() is used. See memory_layer.cpp for an example. + */ + void Transform(const vector & mat_vector, + Blob* transformed_blob); + + /** + * @brief Applies the transformation defined in the data layer's + * transform_param block to a cv::Mat + * + * @param cv_img + * cv::Mat containing the data to be transformed. + * @param transformed_blob + * This is destination blob. It can be part of top blob's data if + * set_cpu_data() is used. See image_data_layer.cpp for an example. + */ + void Transform(const cv::Mat& cv_img, Blob* transformed_blob); + + /** + * @brief Applies the same transformation defined in the data layer's + * transform_param block to all the num images in a input_blob. + * + * @param input_blob + * A Blob containing the data to be transformed. It applies the same + * transformation to all the num images in the blob. + * @param transformed_blob + * This is destination blob, it will contain as many images as the + * input blob. It can be part of top blob's data. + */ + void Transform(Blob* input_blob, Blob* transformed_blob); + + /** + * @brief Infers the shape of transformed_blob will have when + * the transformation is applied to the data. + * + * @param datum + * Datum containing the data to be transformed. + */ + vector InferBlobShape(const Datum& datum); + /** + * @brief Infers the shape of transformed_blob will have when + * the transformation is applied to the data. + * It uses the first element to infer the shape of the blob. + * + * @param datum_vector + * A vector of Datum containing the data to be transformed. + */ + vector InferBlobShape(const vector & datum_vector); + /** + * @brief Infers the shape of transformed_blob will have when + * the transformation is applied to the data. + * It uses the first element to infer the shape of the blob. + * + * @param mat_vector + * A vector of Mat containing the data to be transformed. + */ + vector InferBlobShape(const vector & mat_vector); + /** + * @brief Infers the shape of transformed_blob will have when + * the transformation is applied to the data. + * + * @param cv_img + * cv::Mat containing the data to be transformed. + */ + vector InferBlobShape(const cv::Mat& cv_img); + + protected: + /** + * @brief Generates a random integer from Uniform({0, 1, ..., n-1}). + * + * @param n + * The upperbound (exclusive) value of the random number. + * @return + * A uniformly random integer value from ({0, 1, ..., n-1}). 
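Putting the public interface above together, a data layer typically constructs one transformer, seeds its RNG, and then transforms each Datum into a destination blob. A minimal usage sketch, assuming a float net and the TRAIN phase (the variable names are illustrative, not from the patch):

    TransformationParameter transform_param;    // e.g. taken from the layer's proto
    DataTransformer<float> transformer(transform_param, TRAIN);
    transformer.InitRand();                      // needed if cropping or mirroring
    Datum datum;                                 // one encoded training example
    Blob<float> transformed;
    transformed.Reshape(transformer.InferBlobShape(datum));
    transformer.Transform(datum, &transformed);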
+ */ + virtual int Rand(int n); + + void Transform(const Datum& datum, Dtype* transformed_data); + // Tranformation parameters + TransformationParameter param_; + + shared_ptr rng_; + Phase phase_; + Blob data_mean_; + vector mean_values_; }; } // namespace caffe diff --git a/include/caffe/device.hpp b/include/caffe/device.hpp new file mode 100644 index 00000000..b6190f28 --- /dev/null +++ b/include/caffe/device.hpp @@ -0,0 +1,86 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
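The Device helper declared just below wraps OpenCL platform discovery, context and command-queue creation, and a kernel cache keyed by name. A hedged usage sketch for a GPU build (the kernel name "some_kernel" and the argument layout are placeholders, not kernels from this patch):

    #include <CL/cl.h>
    #include "caffe/device.hpp"
    void launch_example(cl_mem src, cl_mem dst, int count) {
      caffe::amdDevice.Init();  // discover a platform/device, set up context and queues
      cl_kernel k = caffe::amdDevice.GetKernel("some_kernel");  // placeholder name
      clSetKernelArg(k, 0, sizeof(cl_mem), &src);
      clSetKernelArg(k, 1, sizeof(cl_mem), &dst);
      clSetKernelArg(k, 2, sizeof(int), &count);
      size_t global = count;
      clEnqueueNDRangeKernel(caffe::amdDevice.CommandQueue, k, 1, NULL,
                             &global, NULL, 0, NULL, NULL);
    }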
+ **************************************************************************************/ + +#ifndef CAFFE_DEVICE_HPP +#define CAFFE_DEVICE_HPP +#include +#include +#include "caffe/common.hpp" +namespace caffe { +#ifndef CPU_ONLY +class Device { + public: + Device() + : numPlatforms(0), numDevices(0), device_id(INT_MIN) { + } + ~Device(); + cl_uint numPlatforms; + cl_platform_id * platformIDs; + char platformName[64]; + char openclVersion[64]; + cl_uint numDevices; + cl_device_id * DeviceIDs; + + cl_context Context; + cl_command_queue CommandQueue; + cl_command_queue CommandQueue_helper; + cl_program Program; + cl_device_id * pDevices; + int device_id; + + clblasOrder col; + clblasOrder row; + std::map Kernels; + + cl_int Init(int device_id = -1); + cl_int ConvertToString(std::string pFileName, std::string &Str); + void DisplayPlatformInfo(); + void DisplayInfo(cl_platform_id id, cl_platform_info name, std::string str); + + void GetDeviceInfo(); + void DeviceQuery(); + int GetDevice() { + return device_id; + } + ; + void BuildProgram(std::string kernel_dir); + + template + void DisplayDeviceInfo(cl_device_id id, cl_device_info name, + std::string str); + template + void appendBitfield(T info, T value, std::string name, std::string &str); + + cl_kernel GetKernel(std::string kernel_name); + void ReleaseKernels(); +}; +extern std::string buildOption; +extern Device amdDevice; +#endif +} // namespace caffe + +#endif //CAFFE_DEVICE_HPP + diff --git a/include/caffe/filler.hpp b/include/caffe/filler.hpp index 888f4a4b..ab9d6b39 100644 --- a/include/caffe/filler.hpp +++ b/include/caffe/filler.hpp @@ -18,111 +18,119 @@ namespace caffe { /// @brief Fills a Blob with constant or randomly-generated data. template class Filler { - public: - explicit Filler(const FillerParameter& param) : filler_param_(param) {} - virtual ~Filler() {} - virtual void Fill(Blob* blob) = 0; - protected: - FillerParameter filler_param_; -}; // class Filler - + public: + explicit Filler(const FillerParameter& param) + : filler_param_(param) { + } + virtual ~Filler() { + } + virtual void Fill(Blob* blob) = 0; + protected: + FillerParameter filler_param_; +}; +// class Filler /// @brief Fills a Blob with constant values @f$ x = 0 @f$. template -class ConstantFiller : public Filler { - public: - explicit ConstantFiller(const FillerParameter& param) - : Filler(param) {} - virtual void Fill(Blob* blob) { - Dtype* data = blob->mutable_cpu_data(); - const int count = blob->count(); - const Dtype value = this->filler_param_.value(); - CHECK(count); - for (int i = 0; i < count; ++i) { - data[i] = value; +class ConstantFiller: public Filler { + public: + explicit ConstantFiller(const FillerParameter& param) + : Filler(param) { + } + virtual void Fill(Blob* blob) { + Dtype* data = blob->mutable_cpu_data(); + const int count = blob->count(); + const Dtype value = this->filler_param_.value(); + CHECK(count); + for (int i = 0; i < count; ++i) { + data[i] = value; + } + CHECK_EQ(this->filler_param_.sparse(), -1) + << "Sparsity not supported by this Filler."; } - CHECK_EQ(this->filler_param_.sparse(), -1) - << "Sparsity not supported by this Filler."; - } }; /// @brief Fills a Blob with uniformly distributed values @f$ x\sim U(a, b) @f$. 
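A minimal sketch, not part of the patch, of how the OpenCL Device wrapper introduced in device.hpp above might be used. Only Init(), GetKernel() and the CommandQueue member come from the header; the kernel name "ReLUForward" and its argument layout are hypothetical, and the cl* calls are standard OpenCL 1.x API.

// Sketch only: look up a prebuilt kernel through the global amdDevice and launch it.
#include <CL/cl.h>
#include "caffe/device.hpp"

void run_kernel_sketch(cl_mem src, cl_mem dst, int count) {
  caffe::amdDevice.Init();                                   // default OpenCL device
  cl_kernel k = caffe::amdDevice.GetKernel("ReLUForward");   // hypothetical kernel name
  clSetKernelArg(k, 0, sizeof(cl_mem), &src);
  clSetKernelArg(k, 1, sizeof(cl_mem), &dst);
  clSetKernelArg(k, 2, sizeof(int), &count);
  size_t global = static_cast<size_t>(count);
  clEnqueueNDRangeKernel(caffe::amdDevice.CommandQueue, k, 1, NULL,
                         &global, NULL, 0, NULL, NULL);
  clFinish(caffe::amdDevice.CommandQueue);
}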
template -class UniformFiller : public Filler { - public: - explicit UniformFiller(const FillerParameter& param) - : Filler(param) {} - virtual void Fill(Blob* blob) { - CHECK(blob->count()); - caffe_rng_uniform(blob->count(), Dtype(this->filler_param_.min()), - Dtype(this->filler_param_.max()), blob->mutable_cpu_data()); - CHECK_EQ(this->filler_param_.sparse(), -1) - << "Sparsity not supported by this Filler."; - } +class UniformFiller: public Filler { + public: + explicit UniformFiller(const FillerParameter& param) + : Filler(param) { + } + virtual void Fill(Blob* blob) { + CHECK(blob->count()); + caffe_rng_uniform(blob->count(), Dtype(this->filler_param_.min()), + Dtype(this->filler_param_.max()), blob->mutable_cpu_data()); + CHECK_EQ(this->filler_param_.sparse(), -1) + << "Sparsity not supported by this Filler."; + } }; /// @brief Fills a Blob with Gaussian-distributed values @f$ x = a @f$. template -class GaussianFiller : public Filler { - public: - explicit GaussianFiller(const FillerParameter& param) - : Filler(param) {} - virtual void Fill(Blob* blob) { - Dtype* data = blob->mutable_cpu_data(); - CHECK(blob->count()); - caffe_rng_gaussian(blob->count(), Dtype(this->filler_param_.mean()), - Dtype(this->filler_param_.std()), blob->mutable_cpu_data()); - int sparse = this->filler_param_.sparse(); - CHECK_GE(sparse, -1); - if (sparse >= 0) { - // Sparse initialization is implemented for "weight" blobs; i.e. matrices. - // These have num == channels == 1; width is number of inputs; height is - // number of outputs. The 'sparse' variable specifies the mean number - // of non-zero input weights for a given output. - CHECK_GE(blob->num_axes(), 1); - const int num_outputs = blob->shape(0); - Dtype non_zero_probability = Dtype(sparse) / Dtype(num_outputs); - rand_vec_.reset(new SyncedMemory(blob->count() * sizeof(int))); - int* mask = reinterpret_cast(rand_vec_->mutable_cpu_data()); - caffe_rng_bernoulli(blob->count(), non_zero_probability, mask); - for (int i = 0; i < blob->count(); ++i) { - data[i] *= mask[i]; +class GaussianFiller: public Filler { + public: + explicit GaussianFiller(const FillerParameter& param) + : Filler(param) { + } + virtual void Fill(Blob* blob) { + Dtype* data = blob->mutable_cpu_data(); + CHECK(blob->count()); + caffe_rng_gaussian(blob->count(), + Dtype(this->filler_param_.mean()), Dtype(this->filler_param_.std()), + blob->mutable_cpu_data()); + int sparse = this->filler_param_.sparse(); + CHECK_GE(sparse, -1); + if (sparse >= 0) { + // Sparse initialization is implemented for "weight" blobs; i.e. matrices. + // These have num == channels == 1; width is number of inputs; height is + // number of outputs. The 'sparse' variable specifies the mean number + // of non-zero input weights for a given output. + CHECK_GE(blob->num_axes(), 1); + const int num_outputs = blob->shape(0); + Dtype non_zero_probability = Dtype(sparse) / Dtype(num_outputs); + rand_vec_.reset(new SyncedMemory(blob->count() * sizeof(int))); + int* mask = reinterpret_cast(rand_vec_->mutable_cpu_data()); + caffe_rng_bernoulli(blob->count(), non_zero_probability, mask); + for (int i = 0; i < blob->count(); ++i) { + data[i] *= mask[i]; + } } } - } - protected: - shared_ptr rand_vec_; + protected: + shared_ptr rand_vec_; }; /** @brief Fills a Blob with values @f$ x \in [0, 1] @f$ * such that @f$ \forall i \sum_j x_{ij} = 1 @f$. 
*/ template -class PositiveUnitballFiller : public Filler { - public: - explicit PositiveUnitballFiller(const FillerParameter& param) - : Filler(param) {} - virtual void Fill(Blob* blob) { - Dtype* data = blob->mutable_cpu_data(); - DCHECK(blob->count()); - caffe_rng_uniform(blob->count(), 0, 1, blob->mutable_cpu_data()); - // We expect the filler to not be called very frequently, so we will - // just use a simple implementation - int dim = blob->count() / blob->num(); - CHECK(dim); - for (int i = 0; i < blob->num(); ++i) { - Dtype sum = 0; - for (int j = 0; j < dim; ++j) { - sum += data[i * dim + j]; - } - for (int j = 0; j < dim; ++j) { - data[i * dim + j] /= sum; +class PositiveUnitballFiller: public Filler { + public: + explicit PositiveUnitballFiller(const FillerParameter& param) + : Filler(param) { + } + virtual void Fill(Blob* blob) { + Dtype* data = blob->mutable_cpu_data(); + DCHECK(blob->count()); + caffe_rng_uniform(blob->count(), 0, 1, blob->mutable_cpu_data()); + // We expect the filler to not be called very frequently, so we will + // just use a simple implementation + int dim = blob->count() / blob->num(); + CHECK(dim); + for (int i = 0; i < blob->num(); ++i) { + Dtype sum = 0; + for (int j = 0; j < dim; ++j) { + sum += data[i * dim + j]; + } + for (int j = 0; j < dim; ++j) { + data[i * dim + j] /= sum; + } } + CHECK_EQ(this->filler_param_.sparse(), -1) + << "Sparsity not supported by this Filler."; } - CHECK_EQ(this->filler_param_.sparse(), -1) - << "Sparsity not supported by this Filler."; - } }; /** @@ -142,28 +150,29 @@ class PositiveUnitballFiller : public Filler { * TODO(dox): make notation in above comment consistent with rest & use LaTeX. */ template -class XavierFiller : public Filler { - public: - explicit XavierFiller(const FillerParameter& param) - : Filler(param) {} - virtual void Fill(Blob* blob) { - CHECK(blob->count()); - int fan_in = blob->count() / blob->num(); - int fan_out = blob->count() / blob->channels(); - Dtype n = fan_in; // default to fan_in - if (this->filler_param_.variance_norm() == - FillerParameter_VarianceNorm_AVERAGE) { - n = (fan_in + fan_out) / Dtype(2); - } else if (this->filler_param_.variance_norm() == - FillerParameter_VarianceNorm_FAN_OUT) { - n = fan_out; +class XavierFiller: public Filler { + public: + explicit XavierFiller(const FillerParameter& param) + : Filler(param) { + } + virtual void Fill(Blob* blob) { + CHECK(blob->count()); + int fan_in = blob->count() / blob->num(); + int fan_out = blob->count() / blob->channels(); + Dtype n = fan_in; // default to fan_in + if (this->filler_param_.variance_norm() + == FillerParameter_VarianceNorm_AVERAGE) { + n = (fan_in + fan_out) / Dtype(2); + } else if (this->filler_param_.variance_norm() + == FillerParameter_VarianceNorm_FAN_OUT) { + n = fan_out; + } + Dtype scale = sqrt(Dtype(3) / n); + caffe_rng_uniform(blob->count(), -scale, scale, + blob->mutable_cpu_data()); + CHECK_EQ(this->filler_param_.sparse(), -1) + << "Sparsity not supported by this Filler."; } - Dtype scale = sqrt(Dtype(3) / n); - caffe_rng_uniform(blob->count(), -scale, scale, - blob->mutable_cpu_data()); - CHECK_EQ(this->filler_param_.sparse(), -1) - << "Sparsity not supported by this Filler."; - } }; /** @@ -184,82 +193,84 @@ class XavierFiller : public Filler { * is currently not the case for inner product layers. 
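As a usage note, the fillers in this header are normally obtained through the GetFiller factory declared at the end of the file rather than constructed directly. The sketch below is illustrative and not part of the patch; the blob shape and the choice of variance_norm are examples only.

// Sketch only: fill a 4-D weight blob with the XavierFiller via GetFiller.
#include "caffe/blob.hpp"
#include "caffe/filler.hpp"

void fill_weights_sketch() {
  caffe::Blob<float> weights(64, 3, 7, 7);        // num, channels, height, width
  caffe::FillerParameter fp;
  fp.set_type("xavier");
  fp.set_variance_norm(caffe::FillerParameter_VarianceNorm_AVERAGE);
  caffe::Filler<float>* filler = caffe::GetFiller<float>(fp);
  filler->Fill(&weights);
  delete filler;
  // With AVERAGE, n = (fan_in + fan_out) / 2 and values are drawn from
  // U(-sqrt(3 / n), +sqrt(3 / n)), matching the Fill() body above.
}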
*/ template -class MSRAFiller : public Filler { - public: - explicit MSRAFiller(const FillerParameter& param) - : Filler(param) {} - virtual void Fill(Blob* blob) { - CHECK(blob->count()); - int fan_in = blob->count() / blob->num(); - int fan_out = blob->count() / blob->channels(); - Dtype n = fan_in; // default to fan_in - if (this->filler_param_.variance_norm() == - FillerParameter_VarianceNorm_AVERAGE) { - n = (fan_in + fan_out) / Dtype(2); - } else if (this->filler_param_.variance_norm() == - FillerParameter_VarianceNorm_FAN_OUT) { - n = fan_out; +class MSRAFiller: public Filler { + public: + explicit MSRAFiller(const FillerParameter& param) + : Filler(param) { + } + virtual void Fill(Blob* blob) { + CHECK(blob->count()); + int fan_in = blob->count() / blob->num(); + int fan_out = blob->count() / blob->channels(); + Dtype n = fan_in; // default to fan_in + if (this->filler_param_.variance_norm() + == FillerParameter_VarianceNorm_AVERAGE) { + n = (fan_in + fan_out) / Dtype(2); + } else if (this->filler_param_.variance_norm() + == FillerParameter_VarianceNorm_FAN_OUT) { + n = fan_out; + } + Dtype std = sqrt(Dtype(2) / n); + caffe_rng_gaussian(blob->count(), Dtype(0), std, + blob->mutable_cpu_data()); + CHECK_EQ(this->filler_param_.sparse(), -1) + << "Sparsity not supported by this Filler."; } - Dtype std = sqrt(Dtype(2) / n); - caffe_rng_gaussian(blob->count(), Dtype(0), std, - blob->mutable_cpu_data()); - CHECK_EQ(this->filler_param_.sparse(), -1) - << "Sparsity not supported by this Filler."; - } }; /*! -@brief Fills a Blob with coefficients for bilinear interpolation. + @brief Fills a Blob with coefficients for bilinear interpolation. -A common use case is with the DeconvolutionLayer acting as upsampling. -You can upsample a feature map with shape of (B, C, H, W) by any integer factor -using the following proto. -\code -layer { - name: "upsample", type: "Deconvolution" - bottom: "{{bottom_name}}" top: "{{top_name}}" - convolution_param { - kernel_size: {{2 * factor - factor % 2}} stride: {{factor}} - num_output: {{C}} group: {{C}} - pad: {{ceil((factor - 1) / 2.)}} - weight_filler: { type: "bilinear" } bias_term: false - } - param { lr_mult: 0 decay_mult: 0 } -} -\endcode -Please use this by replacing `{{}}` with your values. By specifying -`num_output: {{C}} group: {{C}}`, it behaves as -channel-wise convolution. The filter shape of this deconvolution layer will be -(C, 1, K, K) where K is `kernel_size`, and this filler will set a (K, K) -interpolation kernel for every channel of the filter identically. The resulting -shape of the top feature map will be (B, C, factor * H, factor * W). -Note that the learning rate and the -weight decay are set to 0 in order to keep coefficient values of bilinear -interpolation unchanged during training. If you apply this to an image, this -operation is equivalent to the following call in Python with Scikit.Image. -\code{.py} -out = skimage.transform.rescale(img, factor, mode='constant', cval=0) -\endcode + A common use case is with the DeconvolutionLayer acting as upsampling. + You can upsample a feature map with shape of (B, C, H, W) by any integer factor + using the following proto. 
+ \code + layer { + name: "upsample", type: "Deconvolution" + bottom: "{{bottom_name}}" top: "{{top_name}}" + convolution_param { + kernel_size: {{2 * factor - factor % 2}} stride: {{factor}} + num_output: {{C}} group: {{C}} + pad: {{ceil((factor - 1) / 2.)}} + weight_filler: { type: "bilinear" } bias_term: false + } + param { lr_mult: 0 decay_mult: 0 } + } + \endcode + Please use this by replacing `{{}}` with your values. By specifying + `num_output: {{C}} group: {{C}}`, it behaves as + channel-wise convolution. The filter shape of this deconvolution layer will be + (C, 1, K, K) where K is `kernel_size`, and this filler will set a (K, K) + interpolation kernel for every channel of the filter identically. The resulting + shape of the top feature map will be (B, C, factor * H, factor * W). + Note that the learning rate and the + weight decay are set to 0 in order to keep coefficient values of bilinear + interpolation unchanged during training. If you apply this to an image, this + operation is equivalent to the following call in Python with Scikit.Image. + \code{.py} + out = skimage.transform.rescale(img, factor, mode='constant', cval=0) + \endcode */ template -class BilinearFiller : public Filler { - public: - explicit BilinearFiller(const FillerParameter& param) - : Filler(param) {} - virtual void Fill(Blob* blob) { - CHECK_EQ(blob->num_axes(), 4) << "Blob must be 4 dim."; - CHECK_EQ(blob->width(), blob->height()) << "Filter must be square"; - Dtype* data = blob->mutable_cpu_data(); - int f = ceil(blob->width() / 2.); - float c = (2 * f - 1 - f % 2) / (2. * f); - for (int i = 0; i < blob->count(); ++i) { - float x = i % blob->width(); - float y = (i / blob->width()) % blob->height(); - data[i] = (1 - fabs(x / f - c)) * (1 - fabs(y / f - c)); +class BilinearFiller: public Filler { + public: + explicit BilinearFiller(const FillerParameter& param) + : Filler(param) { + } + virtual void Fill(Blob* blob) { + CHECK_EQ(blob->num_axes(), 4) << "Blob must be 4 dim."; + CHECK_EQ(blob->width(), blob->height()) << "Filter must be square"; + Dtype* data = blob->mutable_cpu_data(); + int f = ceil(blob->width() / 2.); + float c = (2 * f - 1 - f % 2) / (2. * f); + for (int i = 0; i < blob->count(); ++i) { + float x = i % blob->width(); + float y = (i / blob->width()) % blob->height(); + data[i] = (1 - fabs(x / f - c)) * (1 - fabs(y / f - c)); + } + CHECK_EQ(this->filler_param_.sparse(), -1) + << "Sparsity not supported by this Filler."; } - CHECK_EQ(this->filler_param_.sparse(), -1) - << "Sparsity not supported by this Filler."; - } }; /** @@ -288,7 +299,7 @@ Filler* GetFiller(const FillerParameter& param) { } else { CHECK(false) << "Unknown filler name: " << param.type(); } - return (Filler*)(NULL); + return (Filler*) (NULL); } } // namespace caffe diff --git a/include/caffe/internal_thread.hpp b/include/caffe/internal_thread.hpp index 815ca546..dd8ae8bf 100644 --- a/include/caffe/internal_thread.hpp +++ b/include/caffe/internal_thread.hpp @@ -7,7 +7,9 @@ Forward declare boost::thread instead of including boost/thread.hpp to avoid a boost/NVCC issues (#1009, #1010) on OSX. */ -namespace boost { class thread; } +namespace boost { +class thread; +} namespace caffe { @@ -17,24 +19,27 @@ namespace caffe { * by reimplementing the virutal function InternalThreadEntry. */ class InternalThread { - public: - InternalThread() : thread_() {} - virtual ~InternalThread(); + public: + InternalThread() + : thread_() { + } + virtual ~InternalThread(); - /** Returns true if the thread was successfully started. 
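The intended use of InternalThread, declared in internal_thread.hpp here, is to subclass it and override InternalThreadEntry(), as the prefetching data layers do. The sketch below is illustrative and not part of the patch; the work done inside the thread is made up.

// Sketch only: a hypothetical InternalThread subclass and its driver.
#include <glog/logging.h>
#include "caffe/internal_thread.hpp"

class PrefetchSketch : public caffe::InternalThread {
 protected:
  virtual void InternalThreadEntry() {
    // Runs on the internal thread once StartInternalThread() is called.
    LOG(INFO) << "prefetching in the background";
  }
};

void run_prefetch_sketch() {
  PrefetchSketch worker;
  CHECK(worker.StartInternalThread()) << "failed to start thread";
  // ... main thread does other work here ...
  CHECK(worker.WaitForInternalThreadToExit()) << "thread did not exit cleanly";
}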
**/ - bool StartInternalThread(); + /** Returns true if the thread was successfully started. **/ + bool StartInternalThread(); - /** Will not return until the internal thread has exited. */ - bool WaitForInternalThreadToExit(); + /** Will not return until the internal thread has exited. */ + bool WaitForInternalThreadToExit(); - bool is_started() const; + bool is_started() const; - protected: - /* Implement this method in your subclass - with the code you want your thread to run. */ - virtual void InternalThreadEntry() {} + protected: + /* Implement this method in your subclass + with the code you want your thread to run. */ + virtual void InternalThreadEntry() { + } - shared_ptr thread_; + shared_ptr thread_; }; } // namespace caffe diff --git a/include/caffe/layer.hpp b/include/caffe/layer.hpp index e2eba196..c346ede1 100644 --- a/include/caffe/layer.hpp +++ b/include/caffe/layer.hpp @@ -25,14 +25,14 @@ namespace caffe { */ template class Layer { - public: - /** - * You should not implement your own constructor. Any set up code should go - * to SetUp(), where the dimensions of the bottom blobs are provided to the - * layer. - */ - explicit Layer(const LayerParameter& param) - : layer_param_(param) { + public: + /** + * You should not implement your own constructor. Any set up code should go + * to SetUp(), where the dimensions of the bottom blobs are provided to the + * layer. + */ + explicit Layer(const LayerParameter& param) + : layer_param_(param) { // Set phase and copy blobs (if there are any). phase_ = param.phase(); if (layer_param_.blobs_size() > 0) { @@ -43,361 +43,384 @@ class Layer { } } } - virtual ~Layer() {} - - /** - * @brief Implements common layer setup functionality. - * - * @param bottom the preshaped input blobs - * @param top - * the allocated but unshaped output blobs, to be shaped by Reshape - * - * Checks that the number of bottom and top blobs is correct. - * Calls LayerSetUp to do special layer setup for individual layer types, - * followed by Reshape to set up sizes of top blobs and internal buffers. - * Sets up the loss weight multiplier blobs for any non-zero loss weights. - * This method may not be overridden. - */ - void SetUp(const vector*>& bottom, - const vector*>& top) { - CheckBlobCounts(bottom, top); - LayerSetUp(bottom, top); - Reshape(bottom, top); - SetLossWeights(top); - } - - /** - * @brief Does layer-specific setup: your layer should implement this function - * as well as Reshape. - * - * @param bottom - * the preshaped input blobs, whose data fields store the input data for - * this layer - * @param top - * the allocated but unshaped output blobs - * - * This method should do one-time layer specific setup. This includes reading - * and processing relevent parameters from the layer_param_. - * Setting up the shapes of top blobs and internal buffers should be done in - * Reshape, which will be called before the forward pass to - * adjust the top blob sizes. - */ - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top) {} - - /** - * @brief Adjust the shapes of top blobs and internal buffers to accomodate - * the shapes of the bottom blobs. - * - * @param bottom the input blobs, with the requested input shapes - * @param top the top blobs, which should be reshaped as needed - * - * This method should reshape top blobs as needed according to the shapes - * of the bottom (input) blobs, as well as reshaping any internal buffers - * and making any other necessary adjustments so that the layer can - * accomodate the bottom blobs. 
- */ - virtual void Reshape(const vector*>& bottom, - const vector*>& top) = 0; - - /** - * @brief Given the bottom blobs, compute the top blobs and the loss. - * - * @param bottom - * the input blobs, whose data fields store the input data for this layer - * @param top - * the preshaped output blobs, whose data fields will store this layers' - * outputs - * \return The total loss from the layer. - * - * The Forward wrapper calls the relevant device wrapper function - * (Forward_cpu or Forward_gpu) to compute the top blob values given the - * bottom blobs. If the layer has any non-zero loss_weights, the wrapper - * then computes and returns the loss. - * - * Your layer should implement Forward_cpu and (optionally) Forward_gpu. - */ - inline Dtype Forward(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Given the top blob error gradients, compute the bottom blob error - * gradients. - * - * @param top - * the output blobs, whose diff fields store the gradient of the error - * with respect to themselves - * @param propagate_down - * a vector with equal length to bottom, with each index indicating - * whether to propagate the error gradients down to the bottom blob at - * the corresponding index - * @param bottom - * the input blobs, whose diff fields will store the gradient of the error - * with respect to themselves after Backward is run - * - * The Backward wrapper calls the relevant device wrapper function - * (Backward_cpu or Backward_gpu) to compute the bottom blob diffs given the - * top blob diffs. - * - * Your layer should implement Backward_cpu and (optionally) Backward_gpu. - */ - inline void Backward(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom); - - /** - * @brief Returns the vector of learnable parameter blobs. - */ - vector > >& blobs() { - return blobs_; - } - - /** - * @brief Returns the layer parameter. - */ - const LayerParameter& layer_param() const { return layer_param_; } - - /** - * @brief Writes the layer parameter to a protocol buffer - */ - virtual void ToProto(LayerParameter* param, bool write_diff = false); - - /** - * @brief Returns the scalar loss associated with a top blob at a given index. - */ - inline Dtype loss(const int top_index) const { - return (loss_.size() > top_index) ? loss_[top_index] : Dtype(0); - } - - /** - * @brief Sets the loss associated with a top blob at a given index. - */ - inline void set_loss(const int top_index, const Dtype value) { - if (loss_.size() <= top_index) { - loss_.resize(top_index + 1, Dtype(0)); + virtual ~Layer() { } - loss_[top_index] = value; - } - /** - * @brief Returns the layer type. - */ - virtual inline const char* type() const { return ""; } - - /** - * @brief Returns the exact number of bottom blobs required by the layer, - * or -1 if no exact number is required. - * - * This method should be overridden to return a non-negative value if your - * layer expects some exact number of bottom blobs. - */ - virtual inline int ExactNumBottomBlobs() const { return -1; } - /** - * @brief Returns the minimum number of bottom blobs required by the layer, - * or -1 if no minimum number is required. - * - * This method should be overridden to return a non-negative value if your - * layer expects some minimum number of bottom blobs. - */ - virtual inline int MinBottomBlobs() const { return -1; } - /** - * @brief Returns the maximum number of bottom blobs required by the layer, - * or -1 if no maximum number is required. 
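To make the Layer contract above concrete, here is a minimal CPU-only layer sketch that fills in the hooks being documented: the blob-count queries, Reshape, and the _cpu compute methods. The class is hypothetical, not part of Caffe or this patch; it simply copies bottom to top using caffe_copy from math_functions.hpp.

// Sketch only: a hypothetical identity layer exercising the Layer interface.
#include <vector>
#include "caffe/layer.hpp"
#include "caffe/util/math_functions.hpp"

template <typename Dtype>
class IdentitySketchLayer : public caffe::Layer<Dtype> {
 public:
  explicit IdentitySketchLayer(const caffe::LayerParameter& param)
      : caffe::Layer<Dtype>(param) {}
  virtual inline const char* type() const { return "IdentitySketch"; }
  virtual inline int ExactNumBottomBlobs() const { return 1; }
  virtual inline int ExactNumTopBlobs() const { return 1; }
  virtual void Reshape(const std::vector<caffe::Blob<Dtype>*>& bottom,
                       const std::vector<caffe::Blob<Dtype>*>& top) {
    top[0]->ReshapeLike(*bottom[0]);
  }

 protected:
  virtual void Forward_cpu(const std::vector<caffe::Blob<Dtype>*>& bottom,
                           const std::vector<caffe::Blob<Dtype>*>& top) {
    caffe::caffe_copy(bottom[0]->count(), bottom[0]->cpu_data(),
                      top[0]->mutable_cpu_data());
  }
  virtual void Backward_cpu(const std::vector<caffe::Blob<Dtype>*>& top,
                            const std::vector<bool>& propagate_down,
                            const std::vector<caffe::Blob<Dtype>*>& bottom) {
    if (propagate_down[0]) {
      caffe::caffe_copy(top[0]->count(), top[0]->cpu_diff(),
                        bottom[0]->mutable_cpu_diff());
    }
  }
};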
- * - * This method should be overridden to return a non-negative value if your - * layer expects some maximum number of bottom blobs. - */ - virtual inline int MaxBottomBlobs() const { return -1; } - /** - * @brief Returns the exact number of top blobs required by the layer, - * or -1 if no exact number is required. - * - * This method should be overridden to return a non-negative value if your - * layer expects some exact number of top blobs. - */ - virtual inline int ExactNumTopBlobs() const { return -1; } - /** - * @brief Returns the minimum number of top blobs required by the layer, - * or -1 if no minimum number is required. - * - * This method should be overridden to return a non-negative value if your - * layer expects some minimum number of top blobs. - */ - virtual inline int MinTopBlobs() const { return -1; } - /** - * @brief Returns the maximum number of top blobs required by the layer, - * or -1 if no maximum number is required. - * - * This method should be overridden to return a non-negative value if your - * layer expects some maximum number of top blobs. - */ - virtual inline int MaxTopBlobs() const { return -1; } - /** - * @brief Returns true if the layer requires an equal number of bottom and - * top blobs. - * - * This method should be overridden to return true if your layer expects an - * equal number of bottom and top blobs. - */ - virtual inline bool EqualNumBottomTopBlobs() const { return false; } + /** + * @brief Implements common layer setup functionality. + * + * @param bottom the preshaped input blobs + * @param top + * the allocated but unshaped output blobs, to be shaped by Reshape + * + * Checks that the number of bottom and top blobs is correct. + * Calls LayerSetUp to do special layer setup for individual layer types, + * followed by Reshape to set up sizes of top blobs and internal buffers. + * Sets up the loss weight multiplier blobs for any non-zero loss weights. + * This method may not be overridden. + */ + void SetUp(const vector*>& bottom, + const vector*>& top) { + CheckBlobCounts(bottom, top); + LayerSetUp(bottom, top); + Reshape(bottom, top); + SetLossWeights(top); + } - /** - * @brief Return whether "anonymous" top blobs are created automatically - * by the layer. - * - * If this method returns true, Net::Init will create enough "anonymous" top - * blobs to fulfill the requirement specified by ExactNumTopBlobs() or - * MinTopBlobs(). - */ - virtual inline bool AutoTopBlobs() const { return false; } + /** + * @brief Does layer-specific setup: your layer should implement this function + * as well as Reshape. + * + * @param bottom + * the preshaped input blobs, whose data fields store the input data for + * this layer + * @param top + * the allocated but unshaped output blobs + * + * This method should do one-time layer specific setup. This includes reading + * and processing relevent parameters from the layer_param_. + * Setting up the shapes of top blobs and internal buffers should be done in + * Reshape, which will be called before the forward pass to + * adjust the top blob sizes. + */ + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top) { + } - /** - * @brief Return whether to allow force_backward for a given bottom blob - * index. - * - * If AllowForceBackward(i) == false, we will ignore the force_backward - * setting and backpropagate to blob i only if it needs gradient information - * (as is done when force_backward == false). 
- */ - virtual inline bool AllowForceBackward(const int bottom_index) const { - return true; - } + /** + * @brief Adjust the shapes of top blobs and internal buffers to accomodate + * the shapes of the bottom blobs. + * + * @param bottom the input blobs, with the requested input shapes + * @param top the top blobs, which should be reshaped as needed + * + * This method should reshape top blobs as needed according to the shapes + * of the bottom (input) blobs, as well as reshaping any internal buffers + * and making any other necessary adjustments so that the layer can + * accomodate the bottom blobs. + */ + virtual void Reshape(const vector*>& bottom, + const vector*>& top) = 0; + + /** + * @brief Given the bottom blobs, compute the top blobs and the loss. + * + * @param bottom + * the input blobs, whose data fields store the input data for this layer + * @param top + * the preshaped output blobs, whose data fields will store this layers' + * outputs + * \return The total loss from the layer. + * + * The Forward wrapper calls the relevant device wrapper function + * (Forward_cpu or Forward_gpu) to compute the top blob values given the + * bottom blobs. If the layer has any non-zero loss_weights, the wrapper + * then computes and returns the loss. + * + * Your layer should implement Forward_cpu and (optionally) Forward_gpu. + */ + inline Dtype Forward(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Given the top blob error gradients, compute the bottom blob error + * gradients. + * + * @param top + * the output blobs, whose diff fields store the gradient of the error + * with respect to themselves + * @param propagate_down + * a vector with equal length to bottom, with each index indicating + * whether to propagate the error gradients down to the bottom blob at + * the corresponding index + * @param bottom + * the input blobs, whose diff fields will store the gradient of the error + * with respect to themselves after Backward is run + * + * The Backward wrapper calls the relevant device wrapper function + * (Backward_cpu or Backward_gpu) to compute the bottom blob diffs given the + * top blob diffs. + * + * Your layer should implement Backward_cpu and (optionally) Backward_gpu. + */ + inline void Backward(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + /** + * @brief Returns the vector of learnable parameter blobs. + */ + vector > >& blobs() { + return blobs_; + } - /** - * @brief Specifies whether the layer should compute gradients w.r.t. a - * parameter at a particular index given by param_id. - * - * You can safely ignore false values and always compute gradients - * for all parameters, but possibly with wasteful computation. - */ - inline bool param_propagate_down(const int param_id) { - return (param_propagate_down_.size() > param_id) ? - param_propagate_down_[param_id] : false; - } - /** - * @brief Sets whether the layer should compute gradients w.r.t. a - * parameter at a particular index given by param_id. - */ - inline void set_param_propagate_down(const int param_id, const bool value) { - if (param_propagate_down_.size() <= param_id) { - param_propagate_down_.resize(param_id + 1, true); + /** + * @brief Returns the layer parameter. 
+ */ + const LayerParameter& layer_param() const { + return layer_param_; } - param_propagate_down_[param_id] = value; - } + /** + * @brief Writes the layer parameter to a protocol buffer + */ + virtual void ToProto(LayerParameter* param, bool write_diff = false); - protected: - /** The protobuf that stores the layer parameters */ - LayerParameter layer_param_; - /** The phase: TRAIN or TEST */ - Phase phase_; - /** The vector that stores the learnable parameters as a set of blobs. */ - vector > > blobs_; - /** Vector indicating whether to compute the diff of each param blob. */ - vector param_propagate_down_; + /** + * @brief Returns the scalar loss associated with a top blob at a given index. + */ + inline Dtype loss(const int top_index) const { + return (loss_.size() > top_index) ? loss_[top_index] : Dtype(0); + } - /** The vector that indicates whether each top blob has a non-zero weight in - * the objective function. */ - vector loss_; + /** + * @brief Sets the loss associated with a top blob at a given index. + */ + inline void set_loss(const int top_index, const Dtype value) { + if (loss_.size() <= top_index) { + loss_.resize(top_index + 1, Dtype(0)); + } + loss_[top_index] = value; + } - /** @brief Using the CPU device, compute the layer output. */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top) = 0; - /** - * @brief Using the GPU device, compute the layer output. - * Fall back to Forward_cpu() if unavailable. - */ - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top) { - // LOG(WARNING) << "Using CPU code as backup."; - return Forward_cpu(bottom, top); - } + /** + * @brief Returns the layer type. + */ + virtual inline const char* type() const { + return ""; + } - /** - * @brief Using the CPU device, compute the gradients for any parameters and - * for the bottom blobs if propagate_down is true. - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) = 0; - /** - * @brief Using the GPU device, compute the gradients for any parameters and - * for the bottom blobs if propagate_down is true. - * Fall back to Backward_cpu() if unavailable. - */ - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - // LOG(WARNING) << "Using CPU code as backup."; - Backward_cpu(top, propagate_down, bottom); - } + /** + * @brief Returns the exact number of bottom blobs required by the layer, + * or -1 if no exact number is required. + * + * This method should be overridden to return a non-negative value if your + * layer expects some exact number of bottom blobs. + */ + virtual inline int ExactNumBottomBlobs() const { + return -1; + } + /** + * @brief Returns the minimum number of bottom blobs required by the layer, + * or -1 if no minimum number is required. + * + * This method should be overridden to return a non-negative value if your + * layer expects some minimum number of bottom blobs. + */ + virtual inline int MinBottomBlobs() const { + return -1; + } + /** + * @brief Returns the maximum number of bottom blobs required by the layer, + * or -1 if no maximum number is required. + * + * This method should be overridden to return a non-negative value if your + * layer expects some maximum number of bottom blobs. + */ + virtual inline int MaxBottomBlobs() const { + return -1; + } + /** + * @brief Returns the exact number of top blobs required by the layer, + * or -1 if no exact number is required. 
+ * + * This method should be overridden to return a non-negative value if your + * layer expects some exact number of top blobs. + */ + virtual inline int ExactNumTopBlobs() const { + return -1; + } + /** + * @brief Returns the minimum number of top blobs required by the layer, + * or -1 if no minimum number is required. + * + * This method should be overridden to return a non-negative value if your + * layer expects some minimum number of top blobs. + */ + virtual inline int MinTopBlobs() const { + return -1; + } + /** + * @brief Returns the maximum number of top blobs required by the layer, + * or -1 if no maximum number is required. + * + * This method should be overridden to return a non-negative value if your + * layer expects some maximum number of top blobs. + */ + virtual inline int MaxTopBlobs() const { + return -1; + } + /** + * @brief Returns true if the layer requires an equal number of bottom and + * top blobs. + * + * This method should be overridden to return true if your layer expects an + * equal number of bottom and top blobs. + */ + virtual inline bool EqualNumBottomTopBlobs() const { + return false; + } - /** - * Called by the parent Layer's SetUp to check that the number of bottom - * and top Blobs provided as input match the expected numbers specified by - * the {ExactNum,Min,Max}{Bottom,Top}Blobs() functions. - */ - virtual void CheckBlobCounts(const vector*>& bottom, - const vector*>& top) { - if (ExactNumBottomBlobs() >= 0) { - CHECK_EQ(ExactNumBottomBlobs(), bottom.size()) - << type() << " Layer takes " << ExactNumBottomBlobs() - << " bottom blob(s) as input."; + /** + * @brief Return whether "anonymous" top blobs are created automatically + * by the layer. + * + * If this method returns true, Net::Init will create enough "anonymous" top + * blobs to fulfill the requirement specified by ExactNumTopBlobs() or + * MinTopBlobs(). + */ + virtual inline bool AutoTopBlobs() const { + return false; } - if (MinBottomBlobs() >= 0) { - CHECK_LE(MinBottomBlobs(), bottom.size()) - << type() << " Layer takes at least " << MinBottomBlobs() - << " bottom blob(s) as input."; + + /** + * @brief Return whether to allow force_backward for a given bottom blob + * index. + * + * If AllowForceBackward(i) == false, we will ignore the force_backward + * setting and backpropagate to blob i only if it needs gradient information + * (as is done when force_backward == false). + */ + virtual inline bool AllowForceBackward(const int bottom_index) const { + return true; } - if (MaxBottomBlobs() >= 0) { - CHECK_GE(MaxBottomBlobs(), bottom.size()) - << type() << " Layer takes at most " << MaxBottomBlobs() - << " bottom blob(s) as input."; + + /** + * @brief Specifies whether the layer should compute gradients w.r.t. a + * parameter at a particular index given by param_id. + * + * You can safely ignore false values and always compute gradients + * for all parameters, but possibly with wasteful computation. + */ + inline bool param_propagate_down(const int param_id) { + return + (param_propagate_down_.size() > param_id) ? + param_propagate_down_[param_id] : false; } - if (ExactNumTopBlobs() >= 0) { - CHECK_EQ(ExactNumTopBlobs(), top.size()) - << type() << " Layer produces " << ExactNumTopBlobs() - << " top blob(s) as output."; + /** + * @brief Sets whether the layer should compute gradients w.r.t. a + * parameter at a particular index given by param_id. 
+ */ + inline void set_param_propagate_down(const int param_id, const bool value) { + if (param_propagate_down_.size() <= param_id) { + param_propagate_down_.resize(param_id + 1, true); + } + param_propagate_down_[param_id] = value; } - if (MinTopBlobs() >= 0) { - CHECK_LE(MinTopBlobs(), top.size()) - << type() << " Layer produces at least " << MinTopBlobs() - << " top blob(s) as output."; + + protected: + /** The protobuf that stores the layer parameters */ + LayerParameter layer_param_; + /** The phase: TRAIN or TEST */ + Phase phase_; + /** The vector that stores the learnable parameters as a set of blobs. */ + vector > > blobs_; + /** Vector indicating whether to compute the diff of each param blob. */ + vector param_propagate_down_; + + /** The vector that indicates whether each top blob has a non-zero weight in + * the objective function. */ + vector loss_; + + /** @brief Using the CPU device, compute the layer output. */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top) = 0; + /** + * @brief Using the GPU device, compute the layer output. + * Fall back to Forward_cpu() if unavailable. + */ + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top) { + // LOG(WARNING) << "Using CPU code as backup."; + return Forward_cpu(bottom, top); } - if (MaxTopBlobs() >= 0) { - CHECK_GE(MaxTopBlobs(), top.size()) - << type() << " Layer produces at most " << MaxTopBlobs() - << " top blob(s) as output."; + + /** + * @brief Using the CPU device, compute the gradients for any parameters and + * for the bottom blobs if propagate_down is true. + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) = 0; + /** + * @brief Using the GPU device, compute the gradients for any parameters and + * for the bottom blobs if propagate_down is true. + * Fall back to Backward_cpu() if unavailable. + */ + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + // LOG(WARNING) << "Using CPU code as backup."; + Backward_cpu(top, propagate_down, bottom); } - if (EqualNumBottomTopBlobs()) { - CHECK_EQ(bottom.size(), top.size()) - << type() << " Layer produces one top blob as output for each " - << "bottom blob input."; + + /** + * Called by the parent Layer's SetUp to check that the number of bottom + * and top Blobs provided as input match the expected numbers specified by + * the {ExactNum,Min,Max}{Bottom,Top}Blobs() functions. 
+ */ + virtual void CheckBlobCounts(const vector*>& bottom, + const vector*>& top) { + if (ExactNumBottomBlobs() >= 0) { + CHECK_EQ(ExactNumBottomBlobs(), bottom.size()) << type() + << " Layer takes " << ExactNumBottomBlobs() + << " bottom blob(s) as input."; + } + if (MinBottomBlobs() >= 0) { + CHECK_LE(MinBottomBlobs(), bottom.size()) << type() + << " Layer takes at least " << MinBottomBlobs() + << " bottom blob(s) as input."; + } + if (MaxBottomBlobs() >= 0) { + CHECK_GE(MaxBottomBlobs(), bottom.size()) << type() + << " Layer takes at most " << MaxBottomBlobs() + << " bottom blob(s) as input."; + } + if (ExactNumTopBlobs() >= 0) { + CHECK_EQ(ExactNumTopBlobs(), top.size()) << type() << " Layer produces " + << ExactNumTopBlobs() << " top blob(s) as output."; + } + if (MinTopBlobs() >= 0) { + CHECK_LE(MinTopBlobs(), top.size()) << type() + << " Layer produces at least " << MinTopBlobs() + << " top blob(s) as output."; + } + if (MaxTopBlobs() >= 0) { + CHECK_GE(MaxTopBlobs(), top.size()) << type() + << " Layer produces at most " << MaxTopBlobs() + << " top blob(s) as output."; + } + if (EqualNumBottomTopBlobs()) { + CHECK_EQ(bottom.size(), top.size()) << type() + << " Layer produces one top blob as output for each " + << "bottom blob input."; + } } - } - /** - * Called by SetUp to initialize the weights associated with any top blobs in - * the loss function. Store non-zero loss weights in the diff blob. - */ - inline void SetLossWeights(const vector*>& top) { - const int num_loss_weights = layer_param_.loss_weight_size(); - if (num_loss_weights) { - CHECK_EQ(top.size(), num_loss_weights) << "loss_weight must be " - "unspecified or specified once per top blob."; - for (int top_id = 0; top_id < top.size(); ++top_id) { - const Dtype loss_weight = layer_param_.loss_weight(top_id); - if (loss_weight == Dtype(0)) { continue; } - this->set_loss(top_id, loss_weight); - const int count = top[top_id]->count(); - Dtype* loss_multiplier = top[top_id]->mutable_cpu_diff(); - caffe_set(count, loss_weight, loss_multiplier); + /** + * Called by SetUp to initialize the weights associated with any top blobs in + * the loss function. Store non-zero loss weights in the diff blob. + */ + inline void SetLossWeights(const vector*>& top) { + const int num_loss_weights = layer_param_.loss_weight_size(); + if (num_loss_weights) { + CHECK_EQ(top.size(), num_loss_weights) << "loss_weight must be " + "unspecified or specified once per top blob."; + for (int top_id = 0; top_id < top.size(); ++top_id) { + const Dtype loss_weight = layer_param_.loss_weight(top_id); + if (loss_weight == Dtype(0)) { + continue; + } + this->set_loss(top_id, loss_weight); + const int count = top[top_id]->count(); + Dtype* loss_multiplier = top[top_id]->mutable_cpu_diff(); + caffe_set(count, loss_weight, loss_multiplier); + } } } - } - DISABLE_COPY_AND_ASSIGN(Layer); -}; // class Layer + DISABLE_COPY_AND_ASSIGN (Layer); +}; +// class Layer // Forward and backward wrappers. 
You should implement the cpu and // gpu specific implementations instead, and should not change these @@ -411,7 +434,9 @@ inline Dtype Layer::Forward(const vector*>& bottom, case Caffe::CPU: Forward_cpu(bottom, top); for (int top_id = 0; top_id < top.size(); ++top_id) { - if (!this->loss(top_id)) { continue; } + if (!this->loss(top_id)) { + continue; + } const int count = top[top_id]->count(); const Dtype* data = top[top_id]->cpu_data(); const Dtype* loss_weights = top[top_id]->cpu_diff(); @@ -422,7 +447,9 @@ inline Dtype Layer::Forward(const vector*>& bottom, Forward_gpu(bottom, top); #ifndef CPU_ONLY for (int top_id = 0; top_id < top.size(); ++top_id) { - if (!this->loss(top_id)) { continue; } + if (!this->loss(top_id)) { + continue; + } const int count = top[top_id]->count(); const Dtype* data = top[top_id]->gpu_data(); const Dtype* loss_weights = top[top_id]->gpu_diff(); @@ -440,8 +467,7 @@ inline Dtype Layer::Forward(const vector*>& bottom, template inline void Layer::Backward(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { switch (Caffe::mode()) { case Caffe::CPU: Backward_cpu(top, propagate_down, bottom); diff --git a/include/caffe/layer_factory.hpp b/include/caffe/layer_factory.hpp index 2fcd9386..6da8d315 100644 --- a/include/caffe/layer_factory.hpp +++ b/include/caffe/layer_factory.hpp @@ -52,64 +52,63 @@ class Layer; template class LayerRegistry { - public: - typedef shared_ptr > (*Creator)(const LayerParameter&); - typedef std::map CreatorRegistry; - - static CreatorRegistry& Registry() { - static CreatorRegistry* g_registry_ = new CreatorRegistry(); - return *g_registry_; - } - - // Adds a creator. - static void AddCreator(const string& type, Creator creator) { - CreatorRegistry& registry = Registry(); - CHECK_EQ(registry.count(type), 0) - << "Layer type " << type << " already registered."; - registry[type] = creator; - } - - // Get a layer using a LayerParameter. - static shared_ptr > CreateLayer(const LayerParameter& param) { - LOG(INFO) << "Creating layer " << param.name(); - const string& type = param.type(); - CreatorRegistry& registry = Registry(); - CHECK_EQ(registry.count(type), 1) << "Unknown layer type: " << type - << " (known types: " << LayerTypeList() << ")"; - return registry[type](param); - } - - private: - // Layer registry should never be instantiated - everything is done with its - // static variables. - LayerRegistry() {} - - static string LayerTypeList() { - CreatorRegistry& registry = Registry(); - string layer_types; - for (typename CreatorRegistry::iterator iter = registry.begin(); - iter != registry.end(); ++iter) { - if (iter != registry.begin()) { - layer_types += ", "; + public: + typedef shared_ptr > (*Creator)(const LayerParameter&); + typedef std::map CreatorRegistry; + + static CreatorRegistry& Registry() { + static CreatorRegistry* g_registry_ = new CreatorRegistry(); + return *g_registry_; + } + + // Adds a creator. + static void AddCreator(const string& type, Creator creator) { + CreatorRegistry& registry = Registry(); + CHECK_EQ(registry.count(type), 0) << "Layer type " << type + << " already registered."; + registry[type] = creator; + } + + // Get a layer using a LayerParameter. 
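A sketch, not part of the patch, of how a layer is hooked into the LayerRegistry shown in layer_factory.hpp here. REGISTER_LAYER_CREATOR is the macro defined at the end of this header; the creator function and the IdentitySketchLayer it constructs (the hypothetical layer from the earlier sketch) are made up for illustration.

// Sketch only: registering a hypothetical layer type with the registry.
#include "caffe/layer.hpp"
#include "caffe/layer_factory.hpp"

namespace caffe {

template <typename Dtype>
shared_ptr<Layer<Dtype> > GetIdentitySketchLayer(const LayerParameter& param) {
  return shared_ptr<Layer<Dtype> >(new IdentitySketchLayer<Dtype>(param));
}
REGISTER_LAYER_CREATOR(IdentitySketch, GetIdentitySketchLayer);

}  // namespace caffe

// Afterwards Net::Init can instantiate it from a prototxt entry such as
//   layer { name: "id" type: "IdentitySketch" bottom: "data" top: "data_copy" }
// via LayerRegistry<Dtype>::CreateLayer(param).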
+ static shared_ptr > CreateLayer(const LayerParameter& param) { + LOG(INFO) << "Creating layer " << param.name(); + const string& type = param.type(); + CreatorRegistry& registry = Registry(); + CHECK_EQ(registry.count(type), 1) << "Unknown layer type: " << type + << " (known types: " << LayerTypeList() << ")"; + return registry[type](param); + } + + private: + // Layer registry should never be instantiated - everything is done with its + // static variables. + LayerRegistry() { + } + + static string LayerTypeList() { + CreatorRegistry& registry = Registry(); + string layer_types; + for (typename CreatorRegistry::iterator iter = registry.begin(); + iter != registry.end(); ++iter) { + if (iter != registry.begin()) { + layer_types += ", "; + } + layer_types += iter->first; } - layer_types += iter->first; + return layer_types; } - return layer_types; - } }; - template class LayerRegisterer { - public: - LayerRegisterer(const string& type, - shared_ptr > (*creator)(const LayerParameter&)) { - // LOG(INFO) << "Registering layer type: " << type; - LayerRegistry::AddCreator(type, creator); - } + public: + LayerRegisterer(const string& type, + shared_ptr > (*creator)(const LayerParameter&)) { + // LOG(INFO) << "Registering layer type: " << type; + LayerRegistry::AddCreator(type, creator); + } }; - #define REGISTER_LAYER_CREATOR(type, creator) \ static LayerRegisterer g_creator_f_##type(#type, creator); \ static LayerRegisterer g_creator_d_##type(#type, creator) \ diff --git a/include/caffe/loss_layers.hpp b/include/caffe/loss_layers.hpp index 86c34241..431bd8ea 100644 --- a/include/caffe/loss_layers.hpp +++ b/include/caffe/loss_layers.hpp @@ -20,72 +20,81 @@ const float kLOG_THRESHOLD = 1e-20; * classification task. */ template -class AccuracyLayer : public Layer { - public: - /** - * @param param provides AccuracyParameter accuracy_param, - * with AccuracyLayer options: - * - top_k (\b optional, default 1). - * Sets the maximum rank @f$ k @f$ at which a prediction is considered - * correct. For example, if @f$ k = 5 @f$, a prediction is counted - * correct if the correct label is among the top 5 predicted labels. - */ - explicit AccuracyLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Accuracy"; } - virtual inline int ExactNumBottomBlobs() const { return 2; } - virtual inline int ExactNumTopBlobs() const { return 1; } - - protected: - /** - * @param bottom input Blob vector (length 2) - * -# @f$ (N \times C \times H \times W) @f$ - * the predictions @f$ x @f$, a Blob with values in - * @f$ [-\infty, +\infty] @f$ indicating the predicted score for each of - * the @f$ K = CHW @f$ classes. Each @f$ x_n @f$ is mapped to a predicted - * label @f$ \hat{l}_n @f$ given by its maximal index: - * @f$ \hat{l}_n = \arg\max\limits_k x_{nk} @f$ - * -# @f$ (N \times 1 \times 1 \times 1) @f$ - * the labels @f$ l @f$, an integer-valued Blob with values - * @f$ l_n \in [0, 1, 2, ..., K - 1] @f$ - * indicating the correct class label among the @f$ K @f$ classes - * @param top output Blob vector (length 1) - * -# @f$ (1 \times 1 \times 1 \times 1) @f$ - * the computed accuracy: @f$ - * \frac{1}{N} \sum\limits_{n=1}^N \delta\{ \hat{l}_n = l_n \} - * @f$, where @f$ - * \delta\{\mathrm{condition}\} = \left\{ - * \begin{array}{lr} - * 1 & \mbox{if condition} \\ - * 0 & \mbox{otherwise} - * \end{array} \right. 
- * @f$ - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); +class AccuracyLayer: public Layer { + public: + /** + * @param param provides AccuracyParameter accuracy_param, + * with AccuracyLayer options: + * - top_k (\b optional, default 1). + * Sets the maximum rank @f$ k @f$ at which a prediction is considered + * correct. For example, if @f$ k = 5 @f$, a prediction is counted + * correct if the correct label is among the top 5 predicted labels. + */ + explicit AccuracyLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + virtual inline const char* type() const { + return "Accuracy"; + } + virtual inline int ExactNumBottomBlobs() const { + return 2; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } - /// @brief Not implemented -- AccuracyLayer cannot be used as a loss. - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - for (int i = 0; i < propagate_down.size(); ++i) { - if (propagate_down[i]) { NOT_IMPLEMENTED; } + protected: + /** + * @param bottom input Blob vector (length 2) + * -# @f$ (N \times C \times H \times W) @f$ + * the predictions @f$ x @f$, a Blob with values in + * @f$ [-\infty, +\infty] @f$ indicating the predicted score for each of + * the @f$ K = CHW @f$ classes. Each @f$ x_n @f$ is mapped to a predicted + * label @f$ \hat{l}_n @f$ given by its maximal index: + * @f$ \hat{l}_n = \arg\max\limits_k x_{nk} @f$ + * -# @f$ (N \times 1 \times 1 \times 1) @f$ + * the labels @f$ l @f$, an integer-valued Blob with values + * @f$ l_n \in [0, 1, 2, ..., K - 1] @f$ + * indicating the correct class label among the @f$ K @f$ classes + * @param top output Blob vector (length 1) + * -# @f$ (1 \times 1 \times 1 \times 1) @f$ + * the computed accuracy: @f$ + * \frac{1}{N} \sum\limits_{n=1}^N \delta\{ \hat{l}_n = l_n \} + * @f$, where @f$ + * \delta\{\mathrm{condition}\} = \left\{ + * \begin{array}{lr} + * 1 & \mbox{if condition} \\ + * 0 & \mbox{otherwise} + * \end{array} \right. + * @f$ + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + + /// @brief Not implemented -- AccuracyLayer cannot be used as a loss. + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + for (int i = 0; i < propagate_down.size(); ++i) { + if (propagate_down[i]) { + NOT_IMPLEMENTED; + } + } } - } - int label_axis_, outer_num_, inner_num_; + int label_axis_, outer_num_, inner_num_; - int top_k_; + int top_k_; - /// Whether to ignore instances with a certain label. - bool has_ignore_label_; - /// The label indicating that an instance should be ignored. - int ignore_label_; + /// Whether to ignore instances with a certain label. + bool has_ignore_label_; + /// The label indicating that an instance should be ignored. + int ignore_label_; }; /** @@ -97,32 +106,39 @@ class AccuracyLayer : public Layer { * -- the predictions. 
*/ template -class LossLayer : public Layer { - public: - explicit LossLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp( - const vector*>& bottom, const vector*>& top); - virtual void Reshape( - const vector*>& bottom, const vector*>& top); - - virtual inline int ExactNumBottomBlobs() const { return 2; } - - /** - * @brief For convenience and backwards compatibility, instruct the Net to - * automatically allocate a single top Blob for LossLayers, into which - * they output their singleton loss, (even if the user didn't specify - * one in the prototxt, etc.). - */ - virtual inline bool AutoTopBlobs() const { return true; } - virtual inline int ExactNumTopBlobs() const { return 1; } - /** - * We usually cannot backpropagate to the labels; ignore force_backward for - * these inputs. - */ - virtual inline bool AllowForceBackward(const int bottom_index) const { - return bottom_index != 1; - } +class LossLayer: public Layer { + public: + explicit LossLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline int ExactNumBottomBlobs() const { + return 2; + } + + /** + * @brief For convenience and backwards compatibility, instruct the Net to + * automatically allocate a single top Blob for LossLayers, into which + * they output their singleton loss, (even if the user didn't specify + * one in the prototxt, etc.). + */ + virtual inline bool AutoTopBlobs() const { + return true; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + /** + * We usually cannot backpropagate to the labels; ignore force_backward for + * these inputs. + */ + virtual inline bool AllowForceBackward(const int bottom_index) const { + return bottom_index != 1; + } }; /** @@ -150,64 +166,69 @@ class LossLayer : public Layer { * This can be used to train siamese networks. */ template -class ContrastiveLossLayer : public LossLayer { - public: - explicit ContrastiveLossLayer(const LayerParameter& param) - : LossLayer(param), diff_() {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual inline int ExactNumBottomBlobs() const { return 3; } - virtual inline const char* type() const { return "ContrastiveLoss"; } - /** - * Unlike most loss layers, in the ContrastiveLossLayer we can backpropagate - * to the first two inputs. - */ - virtual inline bool AllowForceBackward(const int bottom_index) const { - return bottom_index != 2; - } - - protected: - /// @copydoc ContrastiveLossLayer - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the Contrastive error gradient w.r.t. the inputs. - * - * Computes the gradients with respect to the two input vectors (bottom[0] and - * bottom[1]), but not the similarity label (bottom[2]). - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (1 \times 1 \times 1 \times 1) @f$ - * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, - * as @f$ \lambda @f$ is the coefficient of this layer's output - * @f$\ell_i@f$ in the overall Net loss - * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence - * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. 
- * (*Assuming that this top Blob is not used as a bottom (input) by any - * other layer of the Net.) - * @param propagate_down see Layer::Backward. - * @param bottom input Blob vector (length 2) - * -# @f$ (N \times C \times 1 \times 1) @f$ - * the features @f$a@f$; Backward fills their diff with - * gradients if propagate_down[0] - * -# @f$ (N \times C \times 1 \times 1) @f$ - * the features @f$b@f$; Backward fills their diff with gradients if - * propagate_down[1] - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - Blob diff_; // cached for backward pass - Blob dist_sq_; // cached for backward pass - Blob diff_sq_; // tmp storage for gpu forward pass - Blob summer_vec_; // tmp storage for gpu forward pass +class ContrastiveLossLayer: public LossLayer { + public: + explicit ContrastiveLossLayer(const LayerParameter& param) + : LossLayer(param), diff_() { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual inline int ExactNumBottomBlobs() const { + return 3; + } + virtual inline const char* type() const { + return "ContrastiveLoss"; + } + /** + * Unlike most loss layers, in the ContrastiveLossLayer we can backpropagate + * to the first two inputs. + */ + virtual inline bool AllowForceBackward(const int bottom_index) const { + return bottom_index != 2; + } + + protected: + /// @copydoc ContrastiveLossLayer + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the Contrastive error gradient w.r.t. the inputs. + * + * Computes the gradients with respect to the two input vectors (bottom[0] and + * bottom[1]), but not the similarity label (bottom[2]). + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (1 \times 1 \times 1 \times 1) @f$ + * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, + * as @f$ \lambda @f$ is the coefficient of this layer's output + * @f$\ell_i@f$ in the overall Net loss + * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence + * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. + * (*Assuming that this top Blob is not used as a bottom (input) by any + * other layer of the Net.) + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length 2) + * -# @f$ (N \times C \times 1 \times 1) @f$ + * the features @f$a@f$; Backward fills their diff with + * gradients if propagate_down[0] + * -# @f$ (N \times C \times 1 \times 1) @f$ + * the features @f$b@f$; Backward fills their diff with gradients if + * propagate_down[1] + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + Blob diff_; // cached for backward pass + Blob dist_sq_; // cached for backward pass + Blob diff_sq_; // tmp storage for gpu forward pass + Blob summer_vec_; // tmp storage for gpu forward pass }; /** @@ -237,68 +258,71 @@ class ContrastiveLossLayer : public LossLayer { * linear least squares problems! We use it only as an instructive example.) 
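For reference, the gradient expressions quoted in the EuclideanLossLayer comments that follow drop out directly from the loss this layer computes. A short derivation, written in plain LaTeX rather than the @f$ ... @f$ Doxygen markup of the header, and assuming the usual 1/(2N) normalization used by this layer:

E = \frac{1}{2N} \sum_{n=1}^{N} \left\| \hat{y}_n - y_n \right\|_2^2
\quad\Longrightarrow\quad
\frac{\partial E}{\partial \hat{y}_n} = \frac{1}{N}\,(\hat{y}_n - y_n),
\qquad
\frac{\partial E}{\partial y_n} = \frac{1}{N}\,(y_n - \hat{y}_n),

which matches the per-bottom gradients documented for Backward below.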
*/ template -class EuclideanLossLayer : public LossLayer { - public: - explicit EuclideanLossLayer(const LayerParameter& param) - : LossLayer(param), diff_() {} - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "EuclideanLoss"; } - /** - * Unlike most loss layers, in the EuclideanLossLayer we can backpropagate - * to both inputs -- override to return true and always allow force_backward. - */ - virtual inline bool AllowForceBackward(const int bottom_index) const { - return true; - } - - protected: - /// @copydoc EuclideanLossLayer - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the Euclidean error gradient w.r.t. the inputs. - * - * Unlike other children of LossLayer, EuclideanLossLayer \b can compute - * gradients with respect to the label inputs bottom[1] (but still only will - * if propagate_down[1] is set, due to being produced by learnable parameters - * or if force_backward is set). In fact, this layer is "commutative" -- the - * result is the same regardless of the order of the two bottoms. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (1 \times 1 \times 1 \times 1) @f$ - * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, - * as @f$ \lambda @f$ is the coefficient of this layer's output - * @f$\ell_i@f$ in the overall Net loss - * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence - * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. - * (*Assuming that this top Blob is not used as a bottom (input) by any - * other layer of the Net.) - * @param propagate_down see Layer::Backward. - * @param bottom input Blob vector (length 2) - * -# @f$ (N \times C \times H \times W) @f$ - * the predictions @f$\hat{y}@f$; Backward fills their diff with - * gradients @f$ - * \frac{\partial E}{\partial \hat{y}} = - * \frac{1}{n} \sum\limits_{n=1}^N (\hat{y}_n - y_n) - * @f$ if propagate_down[0] - * -# @f$ (N \times C \times H \times W) @f$ - * the targets @f$y@f$; Backward fills their diff with gradients - * @f$ \frac{\partial E}{\partial y} = - * \frac{1}{n} \sum\limits_{n=1}^N (y_n - \hat{y}_n) - * @f$ if propagate_down[1] - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - Blob diff_; +class EuclideanLossLayer: public LossLayer { + public: + explicit EuclideanLossLayer(const LayerParameter& param) + : LossLayer(param), diff_() { + } + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "EuclideanLoss"; + } + /** + * Unlike most loss layers, in the EuclideanLossLayer we can backpropagate + * to both inputs -- override to return true and always allow force_backward. + */ + virtual inline bool AllowForceBackward(const int bottom_index) const { + return true; + } + + protected: + /// @copydoc EuclideanLossLayer + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the Euclidean error gradient w.r.t. the inputs. 
+ * + * Unlike other children of LossLayer, EuclideanLossLayer \b can compute + * gradients with respect to the label inputs bottom[1] (but still only will + * if propagate_down[1] is set, due to being produced by learnable parameters + * or if force_backward is set). In fact, this layer is "commutative" -- the + * result is the same regardless of the order of the two bottoms. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (1 \times 1 \times 1 \times 1) @f$ + * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, + * as @f$ \lambda @f$ is the coefficient of this layer's output + * @f$\ell_i@f$ in the overall Net loss + * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence + * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. + * (*Assuming that this top Blob is not used as a bottom (input) by any + * other layer of the Net.) + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length 2) + * -# @f$ (N \times C \times H \times W) @f$ + * the predictions @f$\hat{y}@f$; Backward fills their diff with + * gradients @f$ + * \frac{\partial E}{\partial \hat{y}} = + * \frac{1}{n} \sum\limits_{n=1}^N (\hat{y}_n - y_n) + * @f$ if propagate_down[0] + * -# @f$ (N \times C \times H \times W) @f$ + * the targets @f$y@f$; Backward fills their diff with gradients + * @f$ \frac{\partial E}{\partial y} = + * \frac{1}{n} \sum\limits_{n=1}^N (y_n - \hat{y}_n) + * @f$ if propagate_down[1] + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + Blob diff_; }; /** @@ -345,47 +369,50 @@ class EuclideanLossLayer : public LossLayer { * HingeLossLayer). */ template -class HingeLossLayer : public LossLayer { - public: - explicit HingeLossLayer(const LayerParameter& param) - : LossLayer(param) {} - - virtual inline const char* type() const { return "HingeLoss"; } - - protected: - /// @copydoc HingeLossLayer - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the hinge loss error gradient w.r.t. the predictions. - * - * Gradients cannot be computed with respect to the label inputs (bottom[1]), - * so this method ignores bottom[1] and requires !propagate_down[1], crashing - * if propagate_down[1] is set. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (1 \times 1 \times 1 \times 1) @f$ - * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, - * as @f$ \lambda @f$ is the coefficient of this layer's output - * @f$\ell_i@f$ in the overall Net loss - * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence - * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. - * (*Assuming that this top Blob is not used as a bottom (input) by any - * other layer of the Net.) - * @param propagate_down see Layer::Backward. - * propagate_down[1] must be false as we can't compute gradients with - * respect to the labels. 
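// A short standalone sketch of the EuclideanLossLayer math documented above:
// E = 1/(2N) * sum_n ||y_hat_n - y_n||^2, with the predictions' diff filled
// with loss_weight/N * (y_hat - y). Function and parameter names are
// hypothetical; the real layer operates on Blob buffers, not std::vector.
#include <vector>

double euclidean_loss(const std::vector<double>& y_hat,
                      const std::vector<double>& y,
                      int num, double loss_weight,
                      std::vector<double>* pred_diff) {
  pred_diff->resize(y_hat.size());
  double sum_sq = 0.0;
  for (size_t i = 0; i < y_hat.size(); ++i) {
    const double d = y_hat[i] - y[i];
    (*pred_diff)[i] = loss_weight * d / num;   // dE/dy_hat, as in the doc above
    sum_sq += d * d;
  }
  return sum_sq / (2.0 * num);                 // the forward loss value
}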
- * @param bottom input Blob vector (length 2) - * -# @f$ (N \times C \times H \times W) @f$ - * the predictions @f$t@f$; Backward computes diff - * @f$ \frac{\partial E}{\partial t} @f$ - * -# @f$ (N \times 1 \times 1 \times 1) @f$ - * the labels -- ignored as we can't compute their error gradients - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); +class HingeLossLayer: public LossLayer { + public: + explicit HingeLossLayer(const LayerParameter& param) + : LossLayer(param) { + } + + virtual inline const char* type() const { + return "HingeLoss"; + } + + protected: + /// @copydoc HingeLossLayer + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the hinge loss error gradient w.r.t. the predictions. + * + * Gradients cannot be computed with respect to the label inputs (bottom[1]), + * so this method ignores bottom[1] and requires !propagate_down[1], crashing + * if propagate_down[1] is set. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (1 \times 1 \times 1 \times 1) @f$ + * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, + * as @f$ \lambda @f$ is the coefficient of this layer's output + * @f$\ell_i@f$ in the overall Net loss + * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence + * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. + * (*Assuming that this top Blob is not used as a bottom (input) by any + * other layer of the Net.) + * @param propagate_down see Layer::Backward. + * propagate_down[1] must be false as we can't compute gradients with + * respect to the labels. + * @param bottom input Blob vector (length 2) + * -# @f$ (N \times C \times H \times W) @f$ + * the predictions @f$t@f$; Backward computes diff + * @f$ \frac{\partial E}{\partial t} @f$ + * -# @f$ (N \times 1 \times 1 \times 1) @f$ + * the labels -- ignored as we can't compute their error gradients + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); }; /** @@ -421,65 +448,74 @@ class HingeLossLayer : public LossLayer { * @f$, where @f$ H_{l_n} @f$ denotes row @f$l_n@f$ of @f$H@f$. */ template -class InfogainLossLayer : public LossLayer { - public: - explicit InfogainLossLayer(const LayerParameter& param) - : LossLayer(param), infogain_() {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - // InfogainLossLayer takes 2-3 bottom Blobs; if there are 3 the third should - // be the infogain matrix. (Otherwise the infogain matrix is loaded from a - // file specified by LayerParameter.) - virtual inline int ExactNumBottomBlobs() const { return -1; } - virtual inline int MinBottomBlobs() const { return 2; } - virtual inline int MaxBottomBlobs() const { return 3; } - - virtual inline const char* type() const { return "InfogainLoss"; } - - protected: - /// @copydoc InfogainLossLayer - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the infogain loss error gradient w.r.t. the predictions. - * - * Gradients cannot be computed with respect to the label inputs (bottom[1]), - * so this method ignores bottom[1] and requires !propagate_down[1], crashing - * if propagate_down[1] is set. (The same applies to the infogain matrix, if - * provided as bottom[2] rather than in the layer_param.) 
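// A hedged sketch of the L1 hinge loss that HingeLossLayer (closed above)
// evaluates in Forward_cpu. The exact formulation used here -- true-class
// score entering with a flipped sign, per-element max(0, 1 + s), division by
// N -- is assumed from the standard Caffe definition; hinge_loss_l1 is a
// hypothetical helper, and the L2 variant would square each term.
#include <algorithm>
#include <vector>

// scores: N x K raw predictions t; labels: N ground-truth class indices.
double hinge_loss_l1(const std::vector<double>& scores,
                     const std::vector<int>& labels, int N, int K) {
  double loss = 0.0;
  for (int n = 0; n < N; ++n) {
    for (int k = 0; k < K; ++k) {
      const double s = (k == labels[n]) ? -scores[n * K + k]
                                        :  scores[n * K + k];
      loss += std::max(0.0, 1.0 + s);          // margin violation for class k
    }
  }
  return loss / N;
}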
- * - * @param top output Blob vector (length 1), providing the error gradient - * with respect to the outputs - * -# @f$ (1 \times 1 \times 1 \times 1) @f$ - * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, - * as @f$ \lambda @f$ is the coefficient of this layer's output - * @f$\ell_i@f$ in the overall Net loss - * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence - * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. - * (*Assuming that this top Blob is not used as a bottom (input) by any - * other layer of the Net.) - * @param propagate_down see Layer::Backward. - * propagate_down[1] must be false as we can't compute gradients with - * respect to the labels (similarly for propagate_down[2] and the - * infogain matrix, if provided as bottom[2]) - * @param bottom input Blob vector (length 2-3) - * -# @f$ (N \times C \times H \times W) @f$ - * the predictions @f$ \hat{p} @f$; Backward computes diff - * @f$ \frac{\partial E}{\partial \hat{p}} @f$ - * -# @f$ (N \times 1 \times 1 \times 1) @f$ - * the labels -- ignored as we can't compute their error gradients - * -# @f$ (1 \times 1 \times K \times K) @f$ - * (\b optional) the information gain matrix -- ignored as its error - * gradient computation is not implemented. - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - Blob infogain_; +class InfogainLossLayer: public LossLayer { + public: + explicit InfogainLossLayer(const LayerParameter& param) + : LossLayer(param), infogain_() { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + // InfogainLossLayer takes 2-3 bottom Blobs; if there are 3 the third should + // be the infogain matrix. (Otherwise the infogain matrix is loaded from a + // file specified by LayerParameter.) + virtual inline int ExactNumBottomBlobs() const { + return -1; + } + virtual inline int MinBottomBlobs() const { + return 2; + } + virtual inline int MaxBottomBlobs() const { + return 3; + } + + virtual inline const char* type() const { + return "InfogainLoss"; + } + + protected: + /// @copydoc InfogainLossLayer + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the infogain loss error gradient w.r.t. the predictions. + * + * Gradients cannot be computed with respect to the label inputs (bottom[1]), + * so this method ignores bottom[1] and requires !propagate_down[1], crashing + * if propagate_down[1] is set. (The same applies to the infogain matrix, if + * provided as bottom[2] rather than in the layer_param.) + * + * @param top output Blob vector (length 1), providing the error gradient + * with respect to the outputs + * -# @f$ (1 \times 1 \times 1 \times 1) @f$ + * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, + * as @f$ \lambda @f$ is the coefficient of this layer's output + * @f$\ell_i@f$ in the overall Net loss + * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence + * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. + * (*Assuming that this top Blob is not used as a bottom (input) by any + * other layer of the Net.) + * @param propagate_down see Layer::Backward. 
+ * propagate_down[1] must be false as we can't compute gradients with + * respect to the labels (similarly for propagate_down[2] and the + * infogain matrix, if provided as bottom[2]) + * @param bottom input Blob vector (length 2-3) + * -# @f$ (N \times C \times H \times W) @f$ + * the predictions @f$ \hat{p} @f$; Backward computes diff + * @f$ \frac{\partial E}{\partial \hat{p}} @f$ + * -# @f$ (N \times 1 \times 1 \times 1) @f$ + * the labels -- ignored as we can't compute their error gradients + * -# @f$ (1 \times 1 \times K \times K) @f$ + * (\b optional) the information gain matrix -- ignored as its error + * gradient computation is not implemented. + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + Blob infogain_; }; /** @@ -512,50 +548,53 @@ class InfogainLossLayer : public LossLayer { * @f$ */ template -class MultinomialLogisticLossLayer : public LossLayer { - public: - explicit MultinomialLogisticLossLayer(const LayerParameter& param) - : LossLayer(param) {} - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "MultinomialLogisticLoss"; } - - protected: - /// @copydoc MultinomialLogisticLossLayer - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the multinomial logistic loss error gradient w.r.t. the - * predictions. - * - * Gradients cannot be computed with respect to the label inputs (bottom[1]), - * so this method ignores bottom[1] and requires !propagate_down[1], crashing - * if propagate_down[1] is set. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (1 \times 1 \times 1 \times 1) @f$ - * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, - * as @f$ \lambda @f$ is the coefficient of this layer's output - * @f$\ell_i@f$ in the overall Net loss - * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence - * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. - * (*Assuming that this top Blob is not used as a bottom (input) by any - * other layer of the Net.) - * @param propagate_down see Layer::Backward. - * propagate_down[1] must be false as we can't compute gradients with - * respect to the labels. - * @param bottom input Blob vector (length 2) - * -# @f$ (N \times C \times H \times W) @f$ - * the predictions @f$ \hat{p} @f$; Backward computes diff - * @f$ \frac{\partial E}{\partial \hat{p}} @f$ - * -# @f$ (N \times 1 \times 1 \times 1) @f$ - * the labels -- ignored as we can't compute their error gradients - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); +class MultinomialLogisticLossLayer: public LossLayer { + public: + explicit MultinomialLogisticLossLayer(const LayerParameter& param) + : LossLayer(param) { + } + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "MultinomialLogisticLoss"; + } + + protected: + /// @copydoc MultinomialLogisticLossLayer + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the multinomial logistic loss error gradient w.r.t. the + * predictions. + * + * Gradients cannot be computed with respect to the label inputs (bottom[1]), + * so this method ignores bottom[1] and requires !propagate_down[1], crashing + * if propagate_down[1] is set. 
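// A plain-CPU sketch of the loss that the InfogainLossLayer above computes:
// E = -1/N * sum_n sum_k H[l_n][k] * log(p_hat[n][k]), where H is the K x K
// infogain matrix (an identity H reduces this to the multinomial logistic
// loss). The clamp constant and the helper name are illustrative assumptions,
// not the layer's exact code.
#include <algorithm>
#include <cmath>
#include <vector>

double infogain_loss(const std::vector<double>& prob,   // N x K predicted probabilities
                     const std::vector<int>& label,     // N indices selecting rows of H
                     const std::vector<double>& H,      // K x K infogain matrix
                     int N, int K) {
  double loss = 0.0;
  for (int n = 0; n < N; ++n) {
    const int ln = label[n];
    for (int k = 0; k < K; ++k) {
      const double p = std::max(prob[n * K + k], 1e-20);  // avoid log(0)
      loss -= H[ln * K + k] * std::log(p);
    }
  }
  return loss / N;
}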
+ * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (1 \times 1 \times 1 \times 1) @f$ + * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, + * as @f$ \lambda @f$ is the coefficient of this layer's output + * @f$\ell_i@f$ in the overall Net loss + * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence + * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. + * (*Assuming that this top Blob is not used as a bottom (input) by any + * other layer of the Net.) + * @param propagate_down see Layer::Backward. + * propagate_down[1] must be false as we can't compute gradients with + * respect to the labels. + * @param bottom input Blob vector (length 2) + * -# @f$ (N \times C \times H \times W) @f$ + * the predictions @f$ \hat{p} @f$; Backward computes diff + * @f$ \frac{\partial E}{\partial \hat{p}} @f$ + * -# @f$ (N \times 1 \times 1 \times 1) @f$ + * the labels -- ignored as we can't compute their error gradients + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); }; /** @@ -588,67 +627,69 @@ class MultinomialLogisticLossLayer : public LossLayer { * @f$ */ template -class SigmoidCrossEntropyLossLayer : public LossLayer { - public: - explicit SigmoidCrossEntropyLossLayer(const LayerParameter& param) - : LossLayer(param), - sigmoid_layer_(new SigmoidLayer(param)), - sigmoid_output_(new Blob()) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "SigmoidCrossEntropyLoss"; } - - protected: - /// @copydoc SigmoidCrossEntropyLossLayer - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the sigmoid cross-entropy loss error gradient w.r.t. the - * predictions. - * - * Gradients cannot be computed with respect to the target inputs (bottom[1]), - * so this method ignores bottom[1] and requires !propagate_down[1], crashing - * if propagate_down[1] is set. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (1 \times 1 \times 1 \times 1) @f$ - * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, - * as @f$ \lambda @f$ is the coefficient of this layer's output - * @f$\ell_i@f$ in the overall Net loss - * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence - * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. - * (*Assuming that this top Blob is not used as a bottom (input) by any - * other layer of the Net.) - * @param propagate_down see Layer::Backward. - * propagate_down[1] must be false as gradient computation with respect - * to the targets is not implemented. - * @param bottom input Blob vector (length 2) - * -# @f$ (N \times C \times H \times W) @f$ - * the predictions @f$x@f$; Backward computes diff - * @f$ \frac{\partial E}{\partial x} = - * \frac{1}{n} \sum\limits_{n=1}^N (\hat{p}_n - p_n) - * @f$ - * -# @f$ (N \times 1 \times 1 \times 1) @f$ - * the labels -- ignored as we can't compute their error gradients - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - /// The internal SigmoidLayer used to map predictions to probabilities. 
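// For comparison with the infogain sketch, the MultinomialLogisticLossLayer
// declared above reduces to one term per sample: E = -1/N * sum_n
// log(p_hat[n][l_n]). Again a hypothetical standalone helper, not the layer's
// Forward_cpu.
#include <algorithm>
#include <cmath>
#include <vector>

double multinomial_logistic_loss(const std::vector<double>& prob,  // N x K probabilities
                                 const std::vector<int>& label,    // N class indices
                                 int N, int K) {
  double loss = 0.0;
  for (int n = 0; n < N; ++n) {
    loss -= std::log(std::max(prob[n * K + label[n]], 1e-20));  // clamp avoids log(0)
  }
  return loss / N;
}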
- shared_ptr > sigmoid_layer_; - /// sigmoid_output stores the output of the SigmoidLayer. - shared_ptr > sigmoid_output_; - /// bottom vector holder to call the underlying SigmoidLayer::Forward - vector*> sigmoid_bottom_vec_; - /// top vector holder to call the underlying SigmoidLayer::Forward - vector*> sigmoid_top_vec_; +class SigmoidCrossEntropyLossLayer: public LossLayer { + public: + explicit SigmoidCrossEntropyLossLayer(const LayerParameter& param) + : LossLayer(param), sigmoid_layer_( + new SigmoidLayer(param)), sigmoid_output_(new Blob()) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "SigmoidCrossEntropyLoss"; + } + + protected: + /// @copydoc SigmoidCrossEntropyLossLayer + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the sigmoid cross-entropy loss error gradient w.r.t. the + * predictions. + * + * Gradients cannot be computed with respect to the target inputs (bottom[1]), + * so this method ignores bottom[1] and requires !propagate_down[1], crashing + * if propagate_down[1] is set. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (1 \times 1 \times 1 \times 1) @f$ + * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, + * as @f$ \lambda @f$ is the coefficient of this layer's output + * @f$\ell_i@f$ in the overall Net loss + * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence + * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. + * (*Assuming that this top Blob is not used as a bottom (input) by any + * other layer of the Net.) + * @param propagate_down see Layer::Backward. + * propagate_down[1] must be false as gradient computation with respect + * to the targets is not implemented. + * @param bottom input Blob vector (length 2) + * -# @f$ (N \times C \times H \times W) @f$ + * the predictions @f$x@f$; Backward computes diff + * @f$ \frac{\partial E}{\partial x} = + * \frac{1}{n} \sum\limits_{n=1}^N (\hat{p}_n - p_n) + * @f$ + * -# @f$ (N \times 1 \times 1 \times 1) @f$ + * the labels -- ignored as we can't compute their error gradients + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + /// The internal SigmoidLayer used to map predictions to probabilities. + shared_ptr > sigmoid_layer_; + /// sigmoid_output stores the output of the SigmoidLayer. + shared_ptr > sigmoid_output_; + /// bottom vector holder to call the underlying SigmoidLayer::Forward + vector*> sigmoid_bottom_vec_; + /// top vector holder to call the underlying SigmoidLayer::Forward + vector*> sigmoid_top_vec_; }; // Forward declare SoftmaxLayer for use in SoftmaxWithLossLayer. @@ -668,6 +709,7 @@ template class SoftmaxLayer; * -# @f$ (N \times C \times H \times W) @f$ * the predictions @f$ x @f$, a Blob with values in * @f$ [-\infty, +\infty] @f$ indicating the predicted score for each of + ss * the @f$ K = CHW @f$ classes. 
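// A numerically careful, standalone sketch of the per-element loss that
// SigmoidCrossEntropyLossLayer (declared above) accumulates: for a logit x
// and target p, -[p*log(sigmoid(x)) + (1-p)*log(1-sigmoid(x))] equals
// max(x, 0) - x*p + log(1 + exp(-|x|)), a form that avoids overflow for
// large |x|. The helper name and plain-vector interface are assumptions.
#include <algorithm>
#include <cmath>
#include <vector>

double sigmoid_cross_entropy_loss(const std::vector<double>& x,   // logits
                                  const std::vector<double>& p,   // targets in [0,1]
                                  int num) {                      // batch size N
  double loss = 0.0;
  for (size_t i = 0; i < x.size(); ++i) {
    loss += std::max(x[i], 0.0) - x[i] * p[i]
          + std::log1p(std::exp(-std::fabs(x[i])));
  }
  return loss / num;
}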
This layer maps these scores to a * probability distribution over classes using the softmax function * @f$ \hat{p}_{nk} = \exp(x_{nk}) / @@ -683,84 +725,100 @@ template class SoftmaxLayer; * @f$, for softmax output class probabilites @f$ \hat{p} @f$ */ template -class SoftmaxWithLossLayer : public LossLayer { - public: - /** - * @param param provides LossParameter loss_param, with options: - * - ignore_label (optional) - * Specify a label value that should be ignored when computing the loss. - * - normalize (optional, default true) - * If true, the loss is normalized by the number of (nonignored) labels - * present; otherwise the loss is simply summed over spatial locations. - */ - explicit SoftmaxWithLossLayer(const LayerParameter& param) - : LossLayer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "SoftmaxWithLoss"; } - virtual inline int ExactNumTopBlobs() const { return -1; } - virtual inline int MinTopBlobs() const { return 1; } - virtual inline int MaxTopBlobs() const { return 2; } - - protected: - /// @copydoc SoftmaxWithLossLayer - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - /** - * @brief Computes the softmax loss error gradient w.r.t. the predictions. - * - * Gradients cannot be computed with respect to the label inputs (bottom[1]), - * so this method ignores bottom[1] and requires !propagate_down[1], crashing - * if propagate_down[1] is set. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (1 \times 1 \times 1 \times 1) @f$ - * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, - * as @f$ \lambda @f$ is the coefficient of this layer's output - * @f$\ell_i@f$ in the overall Net loss - * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence - * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. - * (*Assuming that this top Blob is not used as a bottom (input) by any - * other layer of the Net.) - * @param propagate_down see Layer::Backward. - * propagate_down[1] must be false as we can't compute gradients with - * respect to the labels. - * @param bottom input Blob vector (length 2) - * -# @f$ (N \times C \times H \times W) @f$ - * the predictions @f$ x @f$; Backward computes diff - * @f$ \frac{\partial E}{\partial x} @f$ - * -# @f$ (N \times 1 \times 1 \times 1) @f$ - * the labels -- ignored as we can't compute their error gradients - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - - /// The internal SoftmaxLayer used to map predictions to a distribution. - shared_ptr > softmax_layer_; - /// prob stores the output probability predictions from the SoftmaxLayer. - Blob prob_; - /// bottom vector holder used in call to the underlying SoftmaxLayer::Forward - vector*> softmax_bottom_vec_; - /// top vector holder used in call to the underlying SoftmaxLayer::Forward - vector*> softmax_top_vec_; - /// Whether to ignore instances with a certain label. - bool has_ignore_label_; - /// The label indicating that an instance should be ignored. 
- int ignore_label_; - /// Whether to normalize the loss by the total number of values present - /// (otherwise just by the batch size). - bool normalize_; - - int softmax_axis_, outer_num_, inner_num_; +class SoftmaxWithLossLayer: public LossLayer { + public: + /** + * @param param provides LossParameter loss_param, with options: + * - ignore_label (optional) + * Specify a label value that should be ignored when computing the loss. + * - normalize (optional, default true) + * If true, the loss is normalized by the number of (nonignored) labels + * present; otherwise the loss is simply summed over spatial locations. + */ + explicit SoftmaxWithLossLayer(const LayerParameter& param) + : LossLayer(param) { + } + ~SoftmaxWithLossLayer(); + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "SoftmaxWithLoss"; + } + virtual inline int ExactNumTopBlobs() const { + return -1; + } + virtual inline int MinTopBlobs() const { + return 1; + } + virtual inline int MaxTopBlobs() const { + return 2; + } + + protected: + /// @copydoc SoftmaxWithLossLayer + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + /** + * @brief Computes the softmax loss error gradient w.r.t. the predictions. + * + * Gradients cannot be computed with respect to the label inputs (bottom[1]), + * so this method ignores bottom[1] and requires !propagate_down[1], crashing + * if propagate_down[1] is set. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (1 \times 1 \times 1 \times 1) @f$ + * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, + * as @f$ \lambda @f$ is the coefficient of this layer's output + * @f$\ell_i@f$ in the overall Net loss + * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence + * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. + * (*Assuming that this top Blob is not used as a bottom (input) by any + * other layer of the Net.) + * @param propagate_down see Layer::Backward. + * propagate_down[1] must be false as we can't compute gradients with + * respect to the labels. + * @param bottom input Blob vector (length 2) + * -# @f$ (N \times C \times H \times W) @f$ + * the predictions @f$ x @f$; Backward computes diff + * @f$ \frac{\partial E}{\partial x} @f$ + * -# @f$ (N \times 1 \times 1 \times 1) @f$ + * the labels -- ignored as we can't compute their error gradients + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + void ocl_setup(); + + /// The internal SoftmaxLayer used to map predictions to a distribution. + shared_ptr > softmax_layer_; + /// prob stores the output probability predictions from the SoftmaxLayer. + Blob prob_; + /// bottom vector holder used in call to the underlying SoftmaxLayer::Forward + vector*> softmax_bottom_vec_; + /// top vector holder used in call to the underlying SoftmaxLayer::Forward + vector*> softmax_top_vec_; + /// Whether to ignore instances with a certain label. + bool has_ignore_label_; + /// The label indicating that an instance should be ignored. 
+ int ignore_label_; + /// Whether to normalize the loss by the total number of values present + /// (otherwise just by the batch size). + bool normalize_; + + int softmax_axis_, outer_num_, inner_num_; + + protected: + cl_kernel diff_kernel, scal_kernel, softmax_kernel; + cl_mem d_loss; + cl_kernel softmax_loss_fp_kernel; + cl_kernel softmax_loss_bp_kernel; }; } // namespace caffe diff --git a/include/caffe/net.hpp b/include/caffe/net.hpp index 5665df1e..bbd61b88 100644 --- a/include/caffe/net.hpp +++ b/include/caffe/net.hpp @@ -22,247 +22,266 @@ namespace caffe { */ template class Net { - public: - explicit Net(const NetParameter& param); - explicit Net(const string& param_file, Phase phase); - virtual ~Net() {} + public: + explicit Net(const NetParameter& param); + explicit Net(const string& param_file, Phase phase); + virtual ~Net() { + } - /// @brief Initialize a network with a NetParameter. - void Init(const NetParameter& param); + /// @brief Initialize a network with a NetParameter. + void Init(const NetParameter& param); - /** - * @brief Run Forward with the input Blob%s already fed separately. - * - * You can get the input blobs using input_blobs(). - */ - const vector*>& ForwardPrefilled(Dtype* loss = NULL); + /** + * @brief Run Forward with the input Blob%s already fed separately. + * + * You can get the input blobs using input_blobs(). + */ + const vector*>& ForwardPrefilled(Dtype* loss = NULL); - /** - * The From and To variants of Forward and Backward operate on the - * (topological) ordering by which the net is specified. For general DAG - * networks, note that (1) computing from one layer to another might entail - * extra computation on unrelated branches, and (2) computation starting in - * the middle may be incorrect if all of the layers of a fan-in are not - * included. - */ - Dtype ForwardFromTo(int start, int end); - Dtype ForwardFrom(int start); - Dtype ForwardTo(int end); - /// @brief Run forward using a set of bottom blobs, and return the result. - const vector*>& Forward(const vector* > & bottom, - Dtype* loss = NULL); - /** - * @brief Run forward using a serialized BlobProtoVector and return the - * result as a serialized BlobProtoVector - */ - string Forward(const string& input_blob_protos, Dtype* loss = NULL); + /** + * The From and To variants of Forward and Backward operate on the + * (topological) ordering by which the net is specified. For general DAG + * networks, note that (1) computing from one layer to another might entail + * extra computation on unrelated branches, and (2) computation starting in + * the middle may be incorrect if all of the layers of a fan-in are not + * included. + */ + Dtype ForwardFromTo(int start, int end); + Dtype ForwardFrom(int start); + Dtype ForwardTo(int end); + /// @brief Run forward using a set of bottom blobs, and return the result. + const vector*>& Forward(const vector*> & bottom, + Dtype* loss = NULL); + /** + * @brief Run forward using a serialized BlobProtoVector and return the + * result as a serialized BlobProtoVector + */ + string Forward(const string& input_blob_protos, Dtype* loss = NULL); - /** - * The network backward should take no input and output, since it solely - * computes the gradient w.r.t the parameters, and the data has already been - * provided during the forward pass. 
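// Looking back at SoftmaxWithLossLayer above: its Backward pass uses the
// classic softmax-loss gradient dE/dx[n][k] = p_hat[n][k] - 1{k == l_n},
// scaled by loss_weight over the normalizer. A hedged sketch follows;
// ignore_label handling and the non-batch normalization option are omitted,
// and the helper name is hypothetical.
#include <vector>

void softmax_loss_backward(const std::vector<double>& prob,   // N x K softmax output
                           const std::vector<int>& label,     // N ground-truth indices
                           int N, int K, double loss_weight,
                           std::vector<double>* bottom_diff) {
  *bottom_diff = prob;                                   // start from p_hat
  for (int n = 0; n < N; ++n) {
    (*bottom_diff)[n * K + label[n]] -= 1.0;             // subtract the one-hot target
  }
  for (size_t i = 0; i < bottom_diff->size(); ++i) {
    (*bottom_diff)[i] *= loss_weight / N;                // normalize by batch size
  }
}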
- */ - void Backward(); - void BackwardFromTo(int start, int end); - void BackwardFrom(int start); - void BackwardTo(int end); + /** + * The network backward should take no input and output, since it solely + * computes the gradient w.r.t the parameters, and the data has already been + * provided during the forward pass. + */ + void Backward(); + void BackwardFromTo(int start, int end); + void BackwardFrom(int start); + void BackwardTo(int end); - /** - * @brief Reshape all layers from bottom to top. - * - * This is useful to propagate changes to layer sizes without running - * a forward pass, e.g. to compute output feature size. - */ - void Reshape(); + /** + * @brief Reshape all layers from bottom to top. + * + * This is useful to propagate changes to layer sizes without running + * a forward pass, e.g. to compute output feature size. + */ + void Reshape(); - Dtype ForwardBackward(const vector* > & bottom) { - Dtype loss; - Forward(bottom, &loss); - Backward(); - return loss; - } + Dtype ForwardBackward(const vector*> & bottom) { + Dtype loss; + Forward(bottom, &loss); + Backward(); + return loss; + } - /// @brief Updates the network weights based on the diff values computed. - void Update(); + /// @brief Updates the network weights based on the diff values computed. + void Update(); - /** - * @brief For an already initialized net, implicitly copies (i.e., using no - * additional memory) the pre-trained layers from another Net. - */ - void ShareTrainedLayersWith(const Net* other); - // For an already initialized net, CopyTrainedLayersFrom() copies the already - // trained layers from another net parameter instance. - /** - * @brief For an already initialized net, copies the pre-trained layers from - * another Net. - */ - void CopyTrainedLayersFrom(const NetParameter& param); - void CopyTrainedLayersFrom(const string trained_filename); - /// @brief Writes the net to a proto. - void ToProto(NetParameter* param, bool write_diff = false) const; + /** + * @brief For an already initialized net, implicitly copies (i.e., using no + * additional memory) the pre-trained layers from another Net. + */ + void ShareTrainedLayersWith(const Net* other); + // For an already initialized net, CopyTrainedLayersFrom() copies the already + // trained layers from another net parameter instance. + /** + * @brief For an already initialized net, copies the pre-trained layers from + * another Net. + */ + void CopyTrainedLayersFrom(const NetParameter& param); + void CopyTrainedLayersFrom(const string trained_filename); + /// @brief Writes the net to a proto. + void ToProto(NetParameter* param, bool write_diff = false) const; - /// @brief returns the network name. - inline const string& name() const { return name_; } - /// @brief returns the layer names - inline const vector& layer_names() const { return layer_names_; } - /// @brief returns the blob names - inline const vector& blob_names() const { return blob_names_; } - /// @brief returns the blobs - inline const vector > >& blobs() const { - return blobs_; - } - /// @brief returns the layers - inline const vector > >& layers() const { - return layers_; - } - /// @brief returns the phase: TRAIN or TEST - inline Phase phase() const { return phase_; } - /** - * @brief returns the bottom vecs for each layer -- usually you won't - * need this unless you do per-layer checks such as gradients. 
- */ - inline const vector*> >& bottom_vecs() const { - return bottom_vecs_; - } - /** - * @brief returns the top vecs for each layer -- usually you won't - * need this unless you do per-layer checks such as gradients. - */ - inline const vector*> >& top_vecs() const { - return top_vecs_; - } - inline const vector >& bottom_need_backward() const { - return bottom_need_backward_; - } - inline const vector& blob_loss_weights() const { - return blob_loss_weights_; - } - inline const vector& layer_need_backward() const { - return layer_need_backward_; - } - /// @brief returns the parameters - inline const vector > >& params() const { - return params_; - } - /// @brief returns the parameter learning rate multipliers - inline const vector& params_lr() const { return params_lr_; } - inline const vector& params_weight_decay() const { - return params_weight_decay_; - } - const map& param_names_index() const { - return param_names_index_; - } - inline const vector& param_owners() const { return param_owners_; } - /// @brief Input and output blob numbers - inline int num_inputs() const { return net_input_blobs_.size(); } - inline int num_outputs() const { return net_output_blobs_.size(); } - inline const vector*>& input_blobs() const { - return net_input_blobs_; - } - inline const vector*>& output_blobs() const { - return net_output_blobs_; - } - inline const vector& input_blob_indices() const { - return net_input_blob_indices_; - } - inline const vector& output_blob_indices() const { - return net_output_blob_indices_; - } - bool has_blob(const string& blob_name) const; - const shared_ptr > blob_by_name(const string& blob_name) const; - bool has_layer(const string& layer_name) const; - const shared_ptr > layer_by_name(const string& layer_name) const; + /// @brief returns the network name. + inline const string& name() const { + return name_; + } + /// @brief returns the layer names + inline const vector& layer_names() const { + return layer_names_; + } + /// @brief returns the blob names + inline const vector& blob_names() const { + return blob_names_; + } + /// @brief returns the blobs + inline const vector > >& blobs() const { + return blobs_; + } + /// @brief returns the layers + inline const vector > >& layers() const { + return layers_; + } + /// @brief returns the phase: TRAIN or TEST + inline Phase phase() const { + return phase_; + } + /** + * @brief returns the bottom vecs for each layer -- usually you won't + * need this unless you do per-layer checks such as gradients. + */ + inline const vector*> >& bottom_vecs() const { + return bottom_vecs_; + } + /** + * @brief returns the top vecs for each layer -- usually you won't + * need this unless you do per-layer checks such as gradients. 
+ */ + inline const vector*> >& top_vecs() const { + return top_vecs_; + } + inline const vector >& bottom_need_backward() const { + return bottom_need_backward_; + } + inline const vector& blob_loss_weights() const { + return blob_loss_weights_; + } + inline const vector& layer_need_backward() const { + return layer_need_backward_; + } + /// @brief returns the parameters + inline const vector > >& params() const { + return params_; + } + /// @brief returns the parameter learning rate multipliers + inline const vector& params_lr() const { + return params_lr_; + } + inline const vector& params_weight_decay() const { + return params_weight_decay_; + } + const map& param_names_index() const { + return param_names_index_; + } + inline const vector& param_owners() const { + return param_owners_; + } + /// @brief Input and output blob numbers + inline int num_inputs() const { + return net_input_blobs_.size(); + } + inline int num_outputs() const { + return net_output_blobs_.size(); + } + inline const vector*>& input_blobs() const { + return net_input_blobs_; + } + inline const vector*>& output_blobs() const { + return net_output_blobs_; + } + inline const vector& input_blob_indices() const { + return net_input_blob_indices_; + } + inline const vector& output_blob_indices() const { + return net_output_blob_indices_; + } + bool has_blob(const string& blob_name) const; + const shared_ptr > blob_by_name(const string& blob_name) const; + bool has_layer(const string& layer_name) const; + const shared_ptr > layer_by_name( + const string& layer_name) const; - void set_debug_info(const bool value) { debug_info_ = value; } + void set_debug_info(const bool value) { + debug_info_ = value; + } - // Helpers for Init. - /** - * @brief Remove layers that the user specified should be excluded given the current - * phase, level, and stage. - */ - static void FilterNet(const NetParameter& param, - NetParameter* param_filtered); - /// @brief return whether NetState state meets NetStateRule rule - static bool StateMeetsRule(const NetState& state, const NetStateRule& rule, - const string& layer_name); + // Helpers for Init. + /** + * @brief Remove layers that the user specified should be excluded given the current + * phase, level, and stage. + */ + static void FilterNet(const NetParameter& param, + NetParameter* param_filtered); + /// @brief return whether NetState state meets NetStateRule rule + static bool StateMeetsRule(const NetState& state, const NetStateRule& rule, + const string& layer_name); - protected: - // Helpers for Init. - /// @brief Append a new input or top blob to the net. - void AppendTop(const NetParameter& param, const int layer_id, - const int top_id, set* available_blobs, - map* blob_name_to_idx); - /// @brief Append a new bottom blob to the net. - int AppendBottom(const NetParameter& param, const int layer_id, - const int bottom_id, set* available_blobs, - map* blob_name_to_idx); - /// @brief Append a new parameter blob to the net. - void AppendParam(const NetParameter& param, const int layer_id, - const int param_id); + protected: + // Helpers for Init. + /// @brief Append a new input or top blob to the net. + void AppendTop(const NetParameter& param, const int layer_id, + const int top_id, set* available_blobs, + map* blob_name_to_idx); + /// @brief Append a new bottom blob to the net. + int AppendBottom(const NetParameter& param, const int layer_id, + const int bottom_id, set* available_blobs, + map* blob_name_to_idx); + /// @brief Append a new parameter blob to the net. 
+ void AppendParam(const NetParameter& param, const int layer_id, + const int param_id); - /// @brief Helper for displaying debug info in Forward about input Blobs. - void InputDebugInfo(const int layer_id); - /// @brief Helper for displaying debug info in Forward. - void ForwardDebugInfo(const int layer_id); - /// @brief Helper for displaying debug info in Backward. - void BackwardDebugInfo(const int layer_id); - /// @brief Helper for displaying debug info in Update. - void UpdateDebugInfo(const int param_id); + /// @brief Helper for displaying debug info in Forward about input Blobs. + void InputDebugInfo(const int layer_id); + /// @brief Helper for displaying debug info in Forward. + void ForwardDebugInfo(const int layer_id); + /// @brief Helper for displaying debug info in Backward. + void BackwardDebugInfo(const int layer_id); + /// @brief Helper for displaying debug info in Update. + void UpdateDebugInfo(const int param_id); - /// @brief Get misc parameters, e.g. the LR multiplier and weight decay. - void GetLearningRateAndWeightDecay(); + /// @brief Get misc parameters, e.g. the LR multiplier and weight decay. + void GetLearningRateAndWeightDecay(); - /// @brief The network name - string name_; - /// @brief The phase: TRAIN or TEST - Phase phase_; - /// @brief Individual layers in the net - vector > > layers_; - vector layer_names_; - map layer_names_index_; - vector layer_need_backward_; - /// @brief the blobs storing intermediate results between the layer. - vector > > blobs_; - vector blob_names_; - map blob_names_index_; - vector blob_need_backward_; - /// bottom_vecs stores the vectors containing the input for each layer. - /// They don't actually host the blobs (blobs_ does), so we simply store - /// pointers. - vector*> > bottom_vecs_; - vector > bottom_id_vecs_; - vector > bottom_need_backward_; - /// top_vecs stores the vectors containing the output for each layer - vector*> > top_vecs_; - vector > top_id_vecs_; - /// Vector of weight in the loss (or objective) function of each net blob, - /// indexed by blob_id. - vector blob_loss_weights_; - vector > param_id_vecs_; - vector param_owners_; - vector param_display_names_; - vector > param_layer_indices_; - map param_names_index_; - /// blob indices for the input and the output of the net - vector net_input_blob_indices_; - vector net_output_blob_indices_; - vector*> net_input_blobs_; - vector*> net_output_blobs_; - /// The parameters in the network. - vector > > params_; - /// the learning rate multipliers - vector params_lr_; - /// the weight decay multipliers - vector params_weight_decay_; - /// The bytes of memory used by this net - size_t memory_used_; - /// Whether to compute and display debug info for the net. - bool debug_info_; + /// @brief The network name + string name_; + /// @brief The phase: TRAIN or TEST + Phase phase_; + /// @brief Individual layers in the net + vector > > layers_; + vector layer_names_; + map layer_names_index_; + vector layer_need_backward_; + /// @brief the blobs storing intermediate results between the layer. + vector > > blobs_; + vector blob_names_; + map blob_names_index_; + vector blob_need_backward_; + /// bottom_vecs stores the vectors containing the input for each layer. + /// They don't actually host the blobs (blobs_ does), so we simply store + /// pointers. 
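// A schematic usage sketch of the Net interface declared in this header:
// ForwardBackward() runs one forward pass plus gradient computation, and
// Update() applies the diffs already stored on the parameter blobs. In real
// use a Solver drives this loop and rescales the diffs (learning rate,
// momentum, weight decay) first; the phase constant caffe::TRAIN and the
// empty bottom vector (data layers feed the net) are assumptions made only
// for this illustration.
#include <string>
#include <vector>
#include "caffe/net.hpp"

void toy_training_loop(const std::string& prototxt, int iters) {
  caffe::Net<float> net(prototxt, caffe::TRAIN);
  std::vector<caffe::Blob<float>*> bottom;      // empty: data layers feed the net
  for (int i = 0; i < iters; ++i) {
    float loss = net.ForwardBackward(bottom);   // forward + backward
    net.Update();                               // apply the computed diffs
    (void) loss;                                // a Solver would smooth and log this
  }
}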
+ vector*> > bottom_vecs_; + vector > bottom_id_vecs_; + vector > bottom_need_backward_; + /// top_vecs stores the vectors containing the output for each layer + vector*> > top_vecs_; + vector > top_id_vecs_; + /// Vector of weight in the loss (or objective) function of each net blob, + /// indexed by blob_id. + vector blob_loss_weights_; + vector > param_id_vecs_; + vector param_owners_; + vector param_display_names_; + vector > param_layer_indices_; + map param_names_index_; + /// blob indices for the input and the output of the net + vector net_input_blob_indices_; + vector net_output_blob_indices_; + vector*> net_input_blobs_; + vector*> net_output_blobs_; + /// The parameters in the network. + vector > > params_; + /// the learning rate multipliers + vector params_lr_; + /// the weight decay multipliers + vector params_weight_decay_; + /// The bytes of memory used by this net + size_t memory_used_; + /// Whether to compute and display debug info for the net. + bool debug_info_; - DISABLE_COPY_AND_ASSIGN(Net); + DISABLE_COPY_AND_ASSIGN (Net); }; - } // namespace caffe #endif // CAFFE_NET_HPP_ diff --git a/include/caffe/neuron_layers.hpp b/include/caffe/neuron_layers.hpp index c2e0774a..2a240a5f 100644 --- a/include/caffe/neuron_layers.hpp +++ b/include/caffe/neuron_layers.hpp @@ -9,6 +9,7 @@ #include "caffe/common.hpp" #include "caffe/layer.hpp" #include "caffe/proto/caffe.pb.h" +#include "caffe/util/ocl_wrapper.hpp" #define HDF5_DATA_DATASET_NAME "data" #define HDF5_DATA_LABEL_NAME "label" @@ -22,15 +23,20 @@ namespace caffe { * element. */ template -class NeuronLayer : public Layer { - public: - explicit NeuronLayer(const LayerParameter& param) - : Layer(param) {} - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int ExactNumTopBlobs() const { return 1; } +class NeuronLayer: public Layer { + public: + explicit NeuronLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } }; /** @@ -44,45 +50,52 @@ class NeuronLayer : public Layer { * the computed outputs @f$ y = |x| @f$ */ template -class AbsValLayer : public NeuronLayer { - public: - explicit AbsValLayer(const LayerParameter& param) - : NeuronLayer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "AbsVal"; } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int ExactNumTopBlobs() const { return 1; } - - protected: - /// @copydoc AbsValLayer - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the absolute value inputs. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (N \times C \times H \times W) @f$ - * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ - * with respect to computed outputs @f$ y @f$ - * @param propagate_down see Layer::Backward. 
- * @param bottom input Blob vector (length 2) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$; Backward fills their diff with - * gradients @f$ - * \frac{\partial E}{\partial x} = - * \mathrm{sign}(x) \frac{\partial E}{\partial y} - * @f$ if propagate_down[0] - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); +class AbsValLayer: public NeuronLayer { + public: + explicit AbsValLayer(const LayerParameter& param) + : NeuronLayer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "AbsVal"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + /// @copydoc AbsValLayer + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the absolute value inputs. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (N \times C \times H \times W) @f$ + * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ + * with respect to computed outputs @f$ y @f$ + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length 2) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$; Backward fills their diff with + * gradients @f$ + * \frac{\partial E}{\partial x} = + * \mathrm{sign}(x) \frac{\partial E}{\partial y} + * @f$ if propagate_down[0] + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); }; /** @@ -103,40 +116,43 @@ class AbsValLayer : public NeuronLayer { * @f$ */ template -class BNLLLayer : public NeuronLayer { - public: - explicit BNLLLayer(const LayerParameter& param) - : NeuronLayer(param) {} - - virtual inline const char* type() const { return "BNLL"; } - - protected: - /// @copydoc BNLLLayer - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the BNLL inputs. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (N \times C \times H \times W) @f$ - * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ - * with respect to computed outputs @f$ y @f$ - * @param propagate_down see Layer::Backward. 
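// The AbsValLayer gradient quoted above is just a sign flip of the incoming
// diff; a minimal sketch on raw buffers (hypothetical helper, separate from
// the layer's own Backward_cpu/Backward_gpu declared above):
#include <vector>

void absval_backward(const std::vector<float>& bottom_data,   // the inputs x
                     const std::vector<float>& top_diff,      // dE/dy
                     std::vector<float>* bottom_diff) {       // receives dE/dx
  bottom_diff->resize(bottom_data.size());
  for (size_t i = 0; i < bottom_data.size(); ++i) {
    const float sign = (bottom_data[i] > 0.f) ? 1.f
                     : (bottom_data[i] < 0.f) ? -1.f : 0.f;
    (*bottom_diff)[i] = sign * top_diff[i];                   // sign(x) * dE/dy
  }
}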
- * @param bottom input Blob vector (length 2) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$; Backward fills their diff with - * gradients @f$ - * \frac{\partial E}{\partial x} - * @f$ if propagate_down[0] - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); +class BNLLLayer: public NeuronLayer { + public: + explicit BNLLLayer(const LayerParameter& param) + : NeuronLayer(param) { + } + + virtual inline const char* type() const { + return "BNLL"; + } + + protected: + /// @copydoc BNLLLayer + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the BNLL inputs. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (N \times C \times H \times W) @f$ + * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ + * with respect to computed outputs @f$ y @f$ + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length 2) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$; Backward fills their diff with + * gradients @f$ + * \frac{\partial E}{\partial x} + * @f$ if propagate_down[0] + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); }; /** @@ -151,56 +167,60 @@ class BNLLLayer : public NeuronLayer { * the computed outputs @f$ y = |x| @f$ */ template -class DropoutLayer : public NeuronLayer { - public: - /** - * @param param provides DropoutParameter dropout_param, - * with DropoutLayer options: - * - dropout_ratio (\b optional, default 0.5). - * Sets the probability @f$ p @f$ that any given unit is dropped. - */ - explicit DropoutLayer(const LayerParameter& param) - : NeuronLayer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Dropout"; } - - protected: - /** - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$ - * @param top output Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the computed outputs. At training time, we have @f$ - * y_{\mbox{train}} = \left\{ - * \begin{array}{ll} - * \frac{x}{1 - p} & \mbox{if } u > p \\ +class DropoutLayer: public NeuronLayer { + public: + /** + * @param param provides DropoutParameter dropout_param, + * with DropoutLayer options: + * - dropout_ratio (\b optional, default 0.5). + * Sets the probability @f$ p @f$ that any given unit is dropped. + */ + explicit DropoutLayer(const LayerParameter& param) + : + NeuronLayer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Dropout"; + } + + protected: + /** + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$ + * @param top output Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the computed outputs. 
At training time, we have @f$ + * y_{\mbox{train}} = \left\{ + * \begin{array}{ll} + * \frac{x}{1 - p} & \mbox{if } u > p \\ * 0 & \mbox{otherwise} - * \end{array} \right. - * @f$, where @f$ u \sim U(0, 1)@f$ is generated independently for each - * input at each iteration. At test time, we simply have - * @f$ y_{\mbox{test}} = \mathbb{E}[y_{\mbox{train}}] = x @f$. - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - /// when divided by UINT_MAX, the randomly generated values @f$u\sim U(0,1)@f$ - Blob rand_vec_; - /// the probability @f$ p @f$ of dropping any input - Dtype threshold_; - /// the scale for undropped inputs at train time @f$ 1 / (1 - p) @f$ - Dtype scale_; - unsigned int uint_thres_; + * \end{array} \right. + * @f$, where @f$ u \sim U(0, 1)@f$ is generated independently for each + * input at each iteration. At test time, we simply have + * @f$ y_{\mbox{test}} = \mathbb{E}[y_{\mbox{train}}] = x @f$. + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + /// when divided by UINT_MAX, the randomly generated values @f$u\sim U(0,1)@f$ + Blob rand_vec_; + /// the probability @f$ p @f$ of dropping any input + Dtype threshold_; + /// the scale for undropped inputs at train time @f$ 1 / (1 - p) @f$ + Dtype scale_; + unsigned int uint_thres_; }; /** @@ -209,62 +229,65 @@ class DropoutLayer : public NeuronLayer { * and base @f$ \gamma @f$. */ template -class ExpLayer : public NeuronLayer { - public: - /** - * @param param provides ExpParameter exp_param, - * with ExpLayer options: - * - scale (\b optional, default 1) the scale @f$ \alpha @f$ - * - shift (\b optional, default 0) the shift @f$ \beta @f$ - * - base (\b optional, default -1 for a value of @f$ e \approx 2.718 @f$) - * the base @f$ \gamma @f$ - */ - explicit ExpLayer(const LayerParameter& param) - : NeuronLayer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Exp"; } - - protected: - /** - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$ - * @param top output Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the computed outputs @f$ - * y = \gamma ^ {\alpha x + \beta} - * @f$ - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the exp inputs. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (N \times C \times H \times W) @f$ - * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ - * with respect to computed outputs @f$ y @f$ - * @param propagate_down see Layer::Backward. 
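// Train-time dropout exactly as the formula above states: each unit survives
// with probability 1 - p and is rescaled by 1/(1 - p) so its expectation
// matches the test-time identity mapping. A self-contained sketch; the real
// layer draws its random mask into rand_vec_ through Caffe's RNG, and
// std::rand here is only a stand-in.
#include <cstdlib>
#include <vector>

void dropout_forward_train(const std::vector<float>& x, float p,
                           std::vector<float>* y) {
  const float scale = 1.f / (1.f - p);
  y->resize(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    const float u = static_cast<float>(std::rand()) / RAND_MAX;  // u ~ U(0,1)
    (*y)[i] = (u > p) ? x[i] * scale : 0.f;                      // drop or rescale
  }
}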
- * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$; Backward fills their diff with - * gradients @f$ - * \frac{\partial E}{\partial x} = - * \frac{\partial E}{\partial y} y \alpha \log_e(gamma) - * @f$ if propagate_down[0] - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - Dtype inner_scale_, outer_scale_; +class ExpLayer: public NeuronLayer { + public: + /** + * @param param provides ExpParameter exp_param, + * with ExpLayer options: + * - scale (\b optional, default 1) the scale @f$ \alpha @f$ + * - shift (\b optional, default 0) the shift @f$ \beta @f$ + * - base (\b optional, default -1 for a value of @f$ e \approx 2.718 @f$) + * the base @f$ \gamma @f$ + */ + explicit ExpLayer(const LayerParameter& param) + : NeuronLayer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Exp"; + } + + protected: + /** + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$ + * @param top output Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the computed outputs @f$ + * y = \gamma ^ {\alpha x + \beta} + * @f$ + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the exp inputs. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (N \times C \times H \times W) @f$ + * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ + * with respect to computed outputs @f$ y @f$ + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$; Backward fills their diff with + * gradients @f$ + * \frac{\partial E}{\partial x} = + * \frac{\partial E}{\partial y} y \alpha \log_e(gamma) + * @f$ if propagate_down[0] + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + Dtype inner_scale_, outer_scale_; }; /** @@ -273,64 +296,67 @@ class ExpLayer : public NeuronLayer { * and base @f$ \gamma @f$. 
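// Aside: a standalone sketch of the ExpLayer mapping above (illustration only).
// Note that "log_e(gamma)" in its Backward doc is the natural log of the base
// (the doxygen source is missing the backslash on \gamma).
#include <cmath>

float exp_layer_forward(float x, float alpha, float beta, float gamma) {
  return std::pow(gamma, alpha * x + beta);     // y = gamma^(alpha*x + beta)
}

// dy/dx = y * alpha * ln(gamma), hence dE/dx = dE/dy * y * alpha * ln(gamma).
float exp_layer_backward(float top_diff, float y, float alpha, float gamma) {
  return top_diff * y * alpha * std::log(gamma);
}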
*/ template -class LogLayer : public NeuronLayer { - public: - /** - * @param param provides LogParameter log_param, - * with LogLayer options: - * - scale (\b optional, default 1) the scale @f$ \alpha @f$ - * - shift (\b optional, default 0) the shift @f$ \beta @f$ - * - base (\b optional, default -1 for a value of @f$ e \approx 2.718 @f$) - * the base @f$ \gamma @f$ - */ - explicit LogLayer(const LayerParameter& param) - : NeuronLayer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Log"; } - - protected: - /** - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$ - * @param top output Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the computed outputs @f$ - * y = log_{\gamma}(\alpha x + \beta) - * @f$ - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the exp inputs. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (N \times C \times H \times W) @f$ - * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ - * with respect to computed outputs @f$ y @f$ - * @param propagate_down see Layer::Backward. - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$; Backward fills their diff with - * gradients @f$ - * \frac{\partial E}{\partial x} = - * \frac{\partial E}{\partial y} y \alpha \log_e(gamma) - * @f$ if propagate_down[0] - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - Dtype base_scale_; - Dtype input_scale_, input_shift_; - Dtype backward_num_scale_; +class LogLayer: public NeuronLayer { + public: + /** + * @param param provides LogParameter log_param, + * with LogLayer options: + * - scale (\b optional, default 1) the scale @f$ \alpha @f$ + * - shift (\b optional, default 0) the shift @f$ \beta @f$ + * - base (\b optional, default -1 for a value of @f$ e \approx 2.718 @f$) + * the base @f$ \gamma @f$ + */ + explicit LogLayer(const LayerParameter& param) + : NeuronLayer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Log"; + } + + protected: + /** + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$ + * @param top output Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the computed outputs @f$ + * y = log_{\gamma}(\alpha x + \beta) + * @f$ + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the exp inputs. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (N \times C \times H \times W) @f$ + * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ + * with respect to computed outputs @f$ y @f$ + * @param propagate_down see Layer::Backward. 
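// Note: the gradient printed in this Backward doc repeats the ExpLayer
// expression. For y = log_gamma(alpha*x + beta) the chain rule gives
// dE/dx = dE/dy * alpha / ((alpha*x + beta) * ln(gamma)). A standalone
// finite-difference check of that expression (illustration only):
#include <cassert>
#include <cmath>

void check_log_layer_gradient() {
  const double alpha = 2.0, beta = 0.5, gamma = 10.0, x = 1.3, h = 1e-6;
  auto f = [&](double v) { return std::log(alpha * v + beta) / std::log(gamma); };
  const double numeric = (f(x + h) - f(x - h)) / (2.0 * h);      // central diff
  const double analytic = alpha / ((alpha * x + beta) * std::log(gamma));
  assert(std::fabs(numeric - analytic) < 1e-6);
}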
+ * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$; Backward fills their diff with + * gradients @f$ + * \frac{\partial E}{\partial x} = + * \frac{\partial E}{\partial y} y \alpha \log_e(gamma) + * @f$ if propagate_down[0] + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + Dtype base_scale_; + Dtype input_scale_, input_shift_; + Dtype backward_num_scale_; }; /** @@ -339,71 +365,74 @@ class LogLayer : public NeuronLayer { * and power @f$ \gamma @f$. */ template -class PowerLayer : public NeuronLayer { - public: - /** - * @param param provides PowerParameter power_param, - * with PowerLayer options: - * - scale (\b optional, default 1) the scale @f$ \alpha @f$ - * - shift (\b optional, default 0) the shift @f$ \beta @f$ - * - power (\b optional, default 1) the power @f$ \gamma @f$ - */ - explicit PowerLayer(const LayerParameter& param) - : NeuronLayer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Power"; } - - protected: - /** - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$ - * @param top output Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the computed outputs @f$ - * y = (\alpha x + \beta) ^ \gamma - * @f$ - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the power inputs. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (N \times C \times H \times W) @f$ - * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ - * with respect to computed outputs @f$ y @f$ - * @param propagate_down see Layer::Backward. 
- * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$; Backward fills their diff with - * gradients @f$ - * \frac{\partial E}{\partial x} = - * \frac{\partial E}{\partial y} - * \alpha \gamma (\alpha x + \beta) ^ {\gamma - 1} = - * \frac{\partial E}{\partial y} - * \frac{\alpha \gamma y}{\alpha x + \beta} - * @f$ if propagate_down[0] - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - /// @brief @f$ \gamma @f$ from layer_param_.power_param() - Dtype power_; - /// @brief @f$ \alpha @f$ from layer_param_.power_param() - Dtype scale_; - /// @brief @f$ \beta @f$ from layer_param_.power_param() - Dtype shift_; - /// @brief Result of @f$ \alpha \gamma @f$ - Dtype diff_scale_; +class PowerLayer: public NeuronLayer { + public: + /** + * @param param provides PowerParameter power_param, + * with PowerLayer options: + * - scale (\b optional, default 1) the scale @f$ \alpha @f$ + * - shift (\b optional, default 0) the shift @f$ \beta @f$ + * - power (\b optional, default 1) the power @f$ \gamma @f$ + */ + explicit PowerLayer(const LayerParameter& param) + : NeuronLayer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Power"; + } + + protected: + /** + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$ + * @param top output Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the computed outputs @f$ + * y = (\alpha x + \beta) ^ \gamma + * @f$ + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the power inputs. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (N \times C \times H \times W) @f$ + * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ + * with respect to computed outputs @f$ y @f$ + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$; Backward fills their diff with + * gradients @f$ + * \frac{\partial E}{\partial x} = + * \frac{\partial E}{\partial y} + * \alpha \gamma (\alpha x + \beta) ^ {\gamma - 1} = + * \frac{\partial E}{\partial y} + * \frac{\alpha \gamma y}{\alpha x + \beta} + * @f$ if propagate_down[0] + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + /// @brief @f$ \gamma @f$ from layer_param_.power_param() + Dtype power_; + /// @brief @f$ \alpha @f$ from layer_param_.power_param() + Dtype scale_; + /// @brief @f$ \beta @f$ from layer_param_.power_param() + Dtype shift_; + /// @brief Result of @f$ \alpha \gamma @f$ + Dtype diff_scale_; }; /** @@ -411,68 +440,70 @@ class PowerLayer : public NeuronLayer { * The simple max is fast to compute, and the function does not saturate. 
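// Aside on the PowerLayer declared above: the two gradient forms in its
// Backward doc agree because y = (alpha*x + beta)^gamma, so
// y / (alpha*x + beta) equals (alpha*x + beta)^(gamma - 1). A quick
// standalone check (illustration only):
#include <cassert>
#include <cmath>

void check_power_layer_gradient_forms() {
  const double alpha = 1.5, beta = 0.2, gamma = 3.0, x = 0.7;
  const double base = alpha * x + beta;
  const double y = std::pow(base, gamma);
  const double form1 = alpha * gamma * std::pow(base, gamma - 1.0);
  const double form2 = alpha * gamma * y / base;
  assert(std::fabs(form1 - form2) < 1e-9);
}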
*/ template -class ReLULayer : public NeuronLayer { - public: - /** - * @param param provides ReLUParameter relu_param, - * with ReLULayer options: - * - negative_slope (\b optional, default 0). - * the value @f$ \nu @f$ by which negative values are multiplied. - */ - explicit ReLULayer(const LayerParameter& param) - : NeuronLayer(param) {} - - virtual inline const char* type() const { return "ReLU"; } - - protected: - /** - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$ - * @param top output Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the computed outputs @f$ - * y = \max(0, x) - * @f$ by default. If a non-zero negative_slope @f$ \nu @f$ is provided, - * the computed outputs are @f$ y = \max(0, x) + \nu \min(0, x) @f$. - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the ReLU inputs. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (N \times C \times H \times W) @f$ - * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ - * with respect to computed outputs @f$ y @f$ - * @param propagate_down see Layer::Backward. - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$; Backward fills their diff with - * gradients @f$ - * \frac{\partial E}{\partial x} = \left\{ - * \begin{array}{lr} - * 0 & \mathrm{if} \; x \le 0 \\ +class ReLULayer: public NeuronLayer { + public: + /** + * @param param provides ReLUParameter relu_param, + * with ReLULayer options: + * - negative_slope (\b optional, default 0). + * the value @f$ \nu @f$ by which negative values are multiplied. + */ + explicit ReLULayer(const LayerParameter& param) + : NeuronLayer(param) { + } + virtual inline const char* type() const { + return "ReLU"; + } + + protected: + /** + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$ + * @param top output Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the computed outputs @f$ + * y = \max(0, x) + * @f$ by default. If a non-zero negative_slope @f$ \nu @f$ is provided, + * the computed outputs are @f$ y = \max(0, x) + \nu \min(0, x) @f$. + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the ReLU inputs. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (N \times C \times H \times W) @f$ + * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ + * with respect to computed outputs @f$ y @f$ + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$; Backward fills their diff with + * gradients @f$ + * \frac{\partial E}{\partial x} = \left\{ + * \begin{array}{lr} + * 0 & \mathrm{if} \; x \le 0 \\ * \frac{\partial E}{\partial y} & \mathrm{if} \; x > 0 - * \end{array} \right. - * @f$ if propagate_down[0], by default. 
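// Aside: in scalar form the ReLULayer rules (including the negative_slope
// variant spelled out just below) reduce to a couple of lines
// (illustration only, not the layer's implementation):
#include <algorithm>

float relu_forward(float x, float nu) {
  return std::max(0.0f, x) + nu * std::min(0.0f, x);  // y = max(0,x)+nu*min(0,x)
}

// dE/dx = dE/dy for x > 0 and nu * dE/dy for x <= 0 (0 when nu == 0).
float relu_backward(float top_diff, float x, float nu) {
  return top_diff * (x > 0 ? 1.0f : nu);
}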
- * If a non-zero negative_slope @f$ \nu @f$ is provided, - * the computed gradients are @f$ - * \frac{\partial E}{\partial x} = \left\{ - * \begin{array}{lr} - * \nu \frac{\partial E}{\partial y} & \mathrm{if} \; x \le 0 \\ + * \end{array} \right. + * @f$ if propagate_down[0], by default. + * If a non-zero negative_slope @f$ \nu @f$ is provided, + * the computed gradients are @f$ + * \frac{\partial E}{\partial x} = \left\{ + * \begin{array}{lr} + * \nu \frac{\partial E}{\partial y} & \mathrm{if} \; x \le 0 \\ * \frac{\partial E}{\partial y} & \mathrm{if} \; x > 0 - * \end{array} \right. - * @f$. - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + * \end{array} \right. + * @f$. + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); }; #ifdef USE_CUDNN @@ -481,23 +512,23 @@ class ReLULayer : public NeuronLayer { */ template class CuDNNReLULayer : public ReLULayer { - public: + public: explicit CuDNNReLULayer(const LayerParameter& param) - : ReLULayer(param), handles_setup_(false) {} + : ReLULayer(param), handles_setup_(false) {} virtual void LayerSetUp(const vector*>& bottom, const vector*>& top); virtual void Reshape(const vector*>& bottom, const vector*>& top); virtual ~CuDNNReLULayer(); - protected: + protected: virtual void Forward_gpu(const vector*>& bottom, const vector*>& top); virtual void Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); bool handles_setup_; - cudnnHandle_t handle_; + cudnnHandle_t handle_; cudnnTensorDescriptor_t bottom_desc_; cudnnTensorDescriptor_t top_desc_; }; @@ -512,50 +543,53 @@ class CuDNNReLULayer : public ReLULayer { * The ReLULayer is often a better choice for this reason. */ template -class SigmoidLayer : public NeuronLayer { - public: - explicit SigmoidLayer(const LayerParameter& param) - : NeuronLayer(param) {} - - virtual inline const char* type() const { return "Sigmoid"; } - - protected: - /** - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$ - * @param top output Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the computed outputs @f$ - * y = (1 + \exp(-x))^{-1} - * @f$ - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the sigmoid inputs. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (N \times C \times H \times W) @f$ - * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ - * with respect to computed outputs @f$ y @f$ - * @param propagate_down see Layer::Backward. 
- * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$; Backward fills their diff with - * gradients @f$ - * \frac{\partial E}{\partial x} - * = \frac{\partial E}{\partial y} y (1 - y) - * @f$ if propagate_down[0] - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); +class SigmoidLayer: public NeuronLayer { + public: + explicit SigmoidLayer(const LayerParameter& param) + : NeuronLayer(param) { + } + + virtual inline const char* type() const { + return "Sigmoid"; + } + + protected: + /** + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$ + * @param top output Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the computed outputs @f$ + * y = (1 + \exp(-x))^{-1} + * @f$ + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the sigmoid inputs. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (N \times C \times H \times W) @f$ + * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ + * with respect to computed outputs @f$ y @f$ + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$; Backward fills their diff with + * gradients @f$ + * \frac{\partial E}{\partial x} + * = \frac{\partial E}{\partial y} y (1 - y) + * @f$ if propagate_down[0] + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); }; #ifdef USE_CUDNN @@ -564,23 +598,23 @@ class SigmoidLayer : public NeuronLayer { */ template class CuDNNSigmoidLayer : public SigmoidLayer { - public: + public: explicit CuDNNSigmoidLayer(const LayerParameter& param) - : SigmoidLayer(param), handles_setup_(false) {} + : SigmoidLayer(param), handles_setup_(false) {} virtual void LayerSetUp(const vector*>& bottom, const vector*>& top); virtual void Reshape(const vector*>& bottom, const vector*>& top); virtual ~CuDNNSigmoidLayer(); - protected: + protected: virtual void Forward_gpu(const vector*>& bottom, const vector*>& top); virtual void Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); bool handles_setup_; - cudnnHandle_t handle_; + cudnnHandle_t handle_; cudnnTensorDescriptor_t bottom_desc_; cudnnTensorDescriptor_t top_desc_; }; @@ -595,52 +629,55 @@ class CuDNNSigmoidLayer : public SigmoidLayer { * The ReLULayer is often a better choice for this reason. 
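// Aside on the SigmoidLayer above: its Backward pass relies on the derivative
// being expressible purely in terms of the output y. A standalone scalar
// sketch (illustration only):
#include <cmath>

float sigmoid_forward(float x) {
  return 1.0f / (1.0f + std::exp(-x));       // y = (1 + exp(-x))^-1
}

float sigmoid_backward(float top_diff, float y) {
  return top_diff * y * (1.0f - y);          // dE/dx = dE/dy * y * (1 - y)
}
// As |x| grows, y*(1-y) -> 0, which is the vanishing gradient noted in the
// briefs for SigmoidLayer and TanHLayer.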
*/ template -class TanHLayer : public NeuronLayer { - public: - explicit TanHLayer(const LayerParameter& param) - : NeuronLayer(param) {} - - virtual inline const char* type() const { return "TanH"; } - - protected: - /** - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$ - * @param top output Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the computed outputs @f$ - * y = \frac{\exp(2x) - 1}{\exp(2x) + 1} - * @f$ - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the sigmoid inputs. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (N \times C \times H \times W) @f$ - * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ - * with respect to computed outputs @f$ y @f$ - * @param propagate_down see Layer::Backward. - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$; Backward fills their diff with - * gradients @f$ - * \frac{\partial E}{\partial x} - * = \frac{\partial E}{\partial y} - * \left(1 - \left[\frac{\exp(2x) - 1}{exp(2x) + 1} \right]^2 \right) - * = \frac{\partial E}{\partial y} (1 - y^2) - * @f$ if propagate_down[0] - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); +class TanHLayer: public NeuronLayer { + public: + explicit TanHLayer(const LayerParameter& param) + : NeuronLayer(param) { + } + + virtual inline const char* type() const { + return "TanH"; + } + + protected: + /** + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$ + * @param top output Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the computed outputs @f$ + * y = \frac{\exp(2x) - 1}{\exp(2x) + 1} + * @f$ + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the sigmoid inputs. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (N \times C \times H \times W) @f$ + * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ + * with respect to computed outputs @f$ y @f$ + * @param propagate_down see Layer::Backward. 
+ * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$; Backward fills their diff with + * gradients @f$ + * \frac{\partial E}{\partial x} + * = \frac{\partial E}{\partial y} + * \left(1 - \left[\frac{\exp(2x) - 1}{exp(2x) + 1} \right]^2 \right) + * = \frac{\partial E}{\partial y} (1 - y^2) + * @f$ if propagate_down[0] + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); }; #ifdef USE_CUDNN @@ -649,23 +686,23 @@ class TanHLayer : public NeuronLayer { */ template class CuDNNTanHLayer : public TanHLayer { - public: + public: explicit CuDNNTanHLayer(const LayerParameter& param) - : TanHLayer(param), handles_setup_(false) {} + : TanHLayer(param), handles_setup_(false) {} virtual void LayerSetUp(const vector*>& bottom, const vector*>& top); virtual void Reshape(const vector*>& bottom, const vector*>& top); virtual ~CuDNNTanHLayer(); - protected: + protected: virtual void Forward_gpu(const vector*>& bottom, const vector*>& top); virtual void Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); bool handles_setup_; - cudnnHandle_t handle_; + cudnnHandle_t handle_; cudnnTensorDescriptor_t bottom_desc_; cudnnTensorDescriptor_t top_desc_; }; @@ -676,47 +713,51 @@ class CuDNNTanHLayer : public TanHLayer { * above threshold; 0 otherwise. */ template -class ThresholdLayer : public NeuronLayer { - public: - /** - * @param param provides ThresholdParameter threshold_param, - * with ThresholdLayer options: - * - threshold (\b optional, default 0). - * the threshold value @f$ t @f$ to which the input values are compared. - */ - explicit ThresholdLayer(const LayerParameter& param) - : NeuronLayer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Threshold"; } - - protected: - /** - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$ - * @param top output Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the computed outputs @f$ - * y = \left\{ - * \begin{array}{lr} - * 0 & \mathrm{if} \; x \le t \\ +class ThresholdLayer: public NeuronLayer { + public: + /** + * @param param provides ThresholdParameter threshold_param, + * with ThresholdLayer options: + * - threshold (\b optional, default 0). + * the threshold value @f$ t @f$ to which the input values are compared. + */ + explicit ThresholdLayer(const LayerParameter& param) + : NeuronLayer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Threshold"; + } + + protected: + /** + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$ + * @param top output Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the computed outputs @f$ + * y = \left\{ + * \begin{array}{lr} + * 0 & \mathrm{if} \; x \le t \\ * 1 & \mathrm{if} \; x > t - * \end{array} \right. 
- * @f$ - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - /// @brief Not implemented (non-differentiable function) - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - NOT_IMPLEMENTED; - } - - Dtype threshold_; + * \end{array} \right. + * @f$ + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + /// @brief Not implemented (non-differentiable function) + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + NOT_IMPLEMENTED; + } + + Dtype threshold_; }; /** @@ -728,80 +769,83 @@ class ThresholdLayer : public NeuronLayer { * equal to 2. The 1st axis (0-based) is seen as channels. */ template -class PReLULayer : public NeuronLayer { - public: - /** - * @param param provides PReLUParameter prelu_param, - * with PReLULayer options: - * - filler (\b optional, FillerParameter, - * default {'type': constant 'value':0.25}). - * - channel_shared (\b optional, default false). - * negative slopes are shared across channels. - */ - explicit PReLULayer(const LayerParameter& param) - : NeuronLayer(param) {} - - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "PReLU"; } - - protected: - /** - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times ...) @f$ - * the inputs @f$ x @f$ - * @param top output Blob vector (length 1) - * -# @f$ (N \times C \times ...) @f$ - * the computed outputs for each channel @f$i@f$ @f$ - * y_i = \max(0, x_i) + a_i \min(0, x_i) - * @f$. - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the PReLU inputs. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (N \times C \times ...) @f$ - * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ - * with respect to computed outputs @f$ y @f$ - * @param propagate_down see Layer::Backward. - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times ...) @f$ - * the inputs @f$ x @f$; For each channel @f$i@f$, backward fills their - * diff with gradients @f$ - * \frac{\partial E}{\partial x_i} = \left\{ - * \begin{array}{lr} - * a_i \frac{\partial E}{\partial y_i} & \mathrm{if} \; x_i \le 0 \\ +class PReLULayer: public NeuronLayer { + public: + /** + * @param param provides PReLUParameter prelu_param, + * with PReLULayer options: + * - filler (\b optional, FillerParameter, + * default {'type': constant 'value':0.25}). + * - channel_shared (\b optional, default false). + * negative slopes are shared across channels. + */ + explicit PReLULayer(const LayerParameter& param) + : NeuronLayer(param) { + } + + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "PReLU"; + } + + protected: + /** + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times ...) 
@f$ + * the inputs @f$ x @f$ + * @param top output Blob vector (length 1) + * -# @f$ (N \times C \times ...) @f$ + * the computed outputs for each channel @f$i@f$ @f$ + * y_i = \max(0, x_i) + a_i \min(0, x_i) + * @f$. + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the PReLU inputs. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (N \times C \times ...) @f$ + * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ + * with respect to computed outputs @f$ y @f$ + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times ...) @f$ + * the inputs @f$ x @f$; For each channel @f$i@f$, backward fills their + * diff with gradients @f$ + * \frac{\partial E}{\partial x_i} = \left\{ + * \begin{array}{lr} + * a_i \frac{\partial E}{\partial y_i} & \mathrm{if} \; x_i \le 0 \\ * \frac{\partial E}{\partial y_i} & \mathrm{if} \; x_i > 0 - * \end{array} \right. - * @f$. - * If param_propagate_down_[0] is true, it fills the diff with gradients - * @f$ - * \frac{\partial E}{\partial a_i} = \left\{ - * \begin{array}{lr} - * \sum_{x_i} x_i \frac{\partial E}{\partial y_i} & \mathrm{if} \; x_i \le 0 \\ + * \end{array} \right. + * @f$. + * If param_propagate_down_[0] is true, it fills the diff with gradients + * @f$ + * \frac{\partial E}{\partial a_i} = \left\{ + * \begin{array}{lr} + * \sum_{x_i} x_i \frac{\partial E}{\partial y_i} & \mathrm{if} \; x_i \le 0 \\ * 0 & \mathrm{if} \; x_i > 0 - * \end{array} \right. - * @f$. - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - bool channel_shared_; - Blob multiplier_; // dot multiplier for backward computation of params - Blob backward_buff_; // temporary buffer for backward computation - Blob bottom_memory_; // memory for in-place computation + * \end{array} \right. + * @f$. 
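// Aside: in scalar form PReLU's forward pass and the two gradients documented
// above are (illustration only, ignoring the per-channel bookkeeping):
float prelu_forward(float x, float a) {
  return x > 0 ? x : a * x;                  // y = max(0,x) + a*min(0,x)
}
float prelu_backward_x(float top_diff, float x, float a) {
  return top_diff * (x > 0 ? 1.0f : a);      // dE/dx
}
float prelu_backward_a(float top_diff, float x) {
  return x > 0 ? 0.0f : top_diff * x;        // one element's share of dE/da_i
}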
+ */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + bool channel_shared_; + Blob multiplier_; // dot multiplier for backward computation of params + Blob backward_buff_; // temporary buffer for backward computation + Blob bottom_memory_; // memory for in-place computation }; } // namespace caffe diff --git a/include/caffe/python_layer.hpp b/include/caffe/python_layer.hpp index 19cf18c9..16d1f7fc 100644 --- a/include/caffe/python_layer.hpp +++ b/include/caffe/python_layer.hpp @@ -11,55 +11,59 @@ namespace bp = boost::python; namespace caffe { template -class PythonLayer : public Layer { - public: - PythonLayer(PyObject* self, const LayerParameter& param) - : Layer(param), self_(bp::handle<>(bp::borrowed(self))) { } +class PythonLayer: public Layer { + public: + PythonLayer(PyObject* self, const LayerParameter& param) + : Layer(param), self_(bp::handle<>(bp::borrowed(self))) { + } - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top) { - try { - self_.attr("setup")(bottom, top); - } catch (bp::error_already_set) { - PyErr_Print(); - throw; + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top) { + try { + self_.attr("setup")(bottom, top); + } catch (bp::error_already_set) { + PyErr_Print(); + throw; + } } - } - virtual void Reshape(const vector*>& bottom, - const vector*>& top) { - try { - self_.attr("reshape")(bottom, top); - } catch (bp::error_already_set) { - PyErr_Print(); - throw; + virtual void Reshape(const vector*>& bottom, + const vector*>& top) { + try { + self_.attr("reshape")(bottom, top); + } catch (bp::error_already_set) { + PyErr_Print(); + throw; + } } - } - virtual inline const char* type() const { return "Python"; } + virtual inline const char* type() const { + return "Python"; + } - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top) { - try { - self_.attr("forward")(bottom, top); - } catch (bp::error_already_set) { - PyErr_Print(); - throw; + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top) { + try { + self_.attr("forward")(bottom, top); + } catch (bp::error_already_set) { + PyErr_Print(); + throw; + } } - } - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - try { - self_.attr("backward")(top, propagate_down, bottom); - } catch (bp::error_already_set) { - PyErr_Print(); - throw; + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + try { + self_.attr("backward")(top, propagate_down, bottom); + } catch (bp::error_already_set) { + PyErr_Print(); + throw; + } } - } - private: - bp::object self_; + private: + bp::object self_; }; } // namespace caffe diff --git a/include/caffe/solver.hpp b/include/caffe/solver.hpp index c2ced487..2bddb77f 100644 --- a/include/caffe/solver.hpp +++ b/include/caffe/solver.hpp @@ -16,116 +16,150 @@ namespace caffe { */ template class Solver { - public: - explicit Solver(const SolverParameter& param); - explicit Solver(const string& param_file); - void Init(const SolverParameter& param); - void InitTrainNet(); - void InitTestNets(); - // The main entry of the solver function. In default, iter will be zero. Pass - // in a non-zero iter number to resume training for a pre-trained net. 
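// Aside: a minimal usage sketch of the Solve() entry points declared here
// (illustration only; the prototxt and snapshot file names are hypothetical):
#include <string>
#include "caffe/solver.hpp"

void resume_training_example() {
  // The param_file constructor parses the SolverParameter prototxt itself.
  caffe::SGDSolver<float> solver("solver.prototxt");
  // The string overload forwards to Solve(const char*); passing nothing
  // (i.e. NULL) starts training from iteration zero instead of restoring.
  solver.Solve(std::string("snapshots/model_iter_10000.solverstate"));
}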
- virtual void Solve(const char* resume_file = NULL); - inline void Solve(const string resume_file) { Solve(resume_file.c_str()); } - void Step(int iters); - // The Restore function implements how one should restore the solver to a - // previously snapshotted state. You should implement the RestoreSolverState() - // function that restores the state from a SolverState protocol buffer. - void Restore(const char* resume_file); - virtual ~Solver() {} - inline shared_ptr > net() { return net_; } - inline const vector > >& test_nets() { - return test_nets_; - } - int iter() { return iter_; } - - protected: - // Make and apply the update value for the current iteration. - virtual void ApplyUpdate() = 0; - // The Solver::Snapshot function implements the basic snapshotting utility - // that stores the learned net. You should implement the SnapshotSolverState() - // function that produces a SolverState protocol buffer that needs to be - // written to disk together with the learned net. - void Snapshot(); - // The test routine - void TestAll(); - void Test(const int test_net_id = 0); - virtual void SnapshotSolverState(SolverState* state) = 0; - virtual void RestoreSolverState(const SolverState& state) = 0; - void DisplayOutputBlobs(const int net_id); - - SolverParameter param_; - int iter_; - int current_step_; - shared_ptr > net_; - vector > > test_nets_; - - DISABLE_COPY_AND_ASSIGN(Solver); + public: + explicit Solver(const SolverParameter& param); + explicit Solver(const string& param_file); + void Init(const SolverParameter& param); + void InitTrainNet(); + void InitTestNets(); + // The main entry of the solver function. In default, iter will be zero. Pass + // in a non-zero iter number to resume training for a pre-trained net. + virtual void Solve(const char* resume_file = NULL); + inline void Solve(const string resume_file) { + Solve(resume_file.c_str()); + } + void Step(int iters); + // The Restore function implements how one should restore the solver to a + // previously snapshotted state. You should implement the RestoreSolverState() + // function that restores the state from a SolverState protocol buffer. + void Restore(const char* resume_file); + virtual ~Solver() { + } + inline shared_ptr > net() { + return net_; + } + inline const vector > >& test_nets() { + return test_nets_; + } + int iter() { + return iter_; + } + + protected: + // Make and apply the update value for the current iteration. + virtual void ApplyUpdate() = 0; + // The Solver::Snapshot function implements the basic snapshotting utility + // that stores the learned net. You should implement the SnapshotSolverState() + // function that produces a SolverState protocol buffer that needs to be + // written to disk together with the learned net. + void Snapshot(); + // The test routine + void TestAll(); + void Test(const int test_net_id = 0); + virtual void SnapshotSolverState(SolverState* state) = 0; + virtual void RestoreSolverState(const SolverState& state) = 0; + + void DisplayOutputBlobs(const int net_id); + + SolverParameter param_; + int iter_; + int current_step_; + shared_ptr > net_; + vector > > test_nets_; + + void ocl_setup(); + protected: + cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel; + + DISABLE_COPY_AND_ASSIGN (Solver); }; - /** * @brief Optimizes the parameters of a Net using * stochastic gradient descent (SGD) with momentum. 
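// Aside: the diff adds cl_kernel members and an ocl_setup() hook to Solver,
// but their implementation is not shown in this hunk. A minimal sketch of
// what such a setup could look like, assuming an already-built cl_program
// (the program handle and kernel entry-point names here are hypothetical):
#include <CL/cl.h>
#include <glog/logging.h>

void example_ocl_setup(cl_program program,
                       cl_kernel* scalar_kernel, cl_kernel* add_kernel) {
  cl_int err;
  *scalar_kernel = clCreateKernel(program, "scalar_kernel", &err);
  CHECK_EQ(err, CL_SUCCESS) << "clCreateKernel(scalar_kernel) failed";
  *add_kernel = clCreateKernel(program, "add_kernel", &err);
  CHECK_EQ(err, CL_SUCCESS) << "clCreateKernel(add_kernel) failed";
}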
*/ template -class SGDSolver : public Solver { - public: - explicit SGDSolver(const SolverParameter& param) - : Solver(param) { PreSolve(); } - explicit SGDSolver(const string& param_file) - : Solver(param_file) { PreSolve(); } - - const vector > >& history() { return history_; } - - protected: - void PreSolve(); - Dtype GetLearningRate(); - virtual void ApplyUpdate(); - virtual void Normalize(int param_id); - virtual void Regularize(int param_id); - virtual void ComputeUpdateValue(int param_id, Dtype rate); - virtual void ClipGradients(); - virtual void SnapshotSolverState(SolverState * state); - virtual void RestoreSolverState(const SolverState& state); - // history maintains the historical momentum data. - // update maintains update related data and is not needed in snapshots. - // temp maintains other information that might be needed in computation - // of gradients/updates and is not needed in snapshots - vector > > history_, update_, temp_; - - DISABLE_COPY_AND_ASSIGN(SGDSolver); +class SGDSolver: public Solver { + public: + explicit SGDSolver(const SolverParameter& param) + : Solver(param) { + PreSolve(); + } + explicit SGDSolver(const string& param_file) + : Solver(param_file) { + PreSolve(); + } + + const vector > >& history() { + return history_; + } + + protected: + void PreSolve(); + Dtype GetLearningRate(); + virtual void ApplyUpdate(); + virtual void Normalize(int param_id); + virtual void Regularize(int param_id); + virtual void ComputeUpdateValue(int param_id, Dtype rate); + virtual void ClipGradients(); + virtual void SnapshotSolverState(SolverState * state); + virtual void RestoreSolverState(const SolverState& state); + // history maintains the historical momentum data. + // update maintains update related data and is not needed in snapshots. 
+ // temp maintains other information that might be needed in computation + // of gradients/updates and is not needed in snapshots + vector > > history_, update_, temp_; + + void ocl_setup(); + protected: + cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel; + + DISABLE_COPY_AND_ASSIGN (SGDSolver); }; template -class NesterovSolver : public SGDSolver { - public: - explicit NesterovSolver(const SolverParameter& param) - : SGDSolver(param) {} - explicit NesterovSolver(const string& param_file) - : SGDSolver(param_file) {} - - protected: - virtual void ComputeUpdateValue(int param_id, Dtype rate); - - DISABLE_COPY_AND_ASSIGN(NesterovSolver); +class NesterovSolver: public SGDSolver { + public: + explicit NesterovSolver(const SolverParameter& param) + : SGDSolver(param) { + } + explicit NesterovSolver(const string& param_file) + : SGDSolver(param_file) { + } + + protected: + virtual void ComputeUpdateValue(int param_id, Dtype rate); + + void ocl_setup(); + protected: + cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel; + + DISABLE_COPY_AND_ASSIGN (NesterovSolver); }; template -class AdaGradSolver : public SGDSolver { - public: - explicit AdaGradSolver(const SolverParameter& param) - : SGDSolver(param) { constructor_sanity_check(); } - explicit AdaGradSolver(const string& param_file) - : SGDSolver(param_file) { constructor_sanity_check(); } - - protected: - virtual void ComputeUpdateValue(int param_id, Dtype rate); - void constructor_sanity_check() { - CHECK_EQ(0, this->param_.momentum()) - << "Momentum cannot be used with AdaGrad."; - } - - DISABLE_COPY_AND_ASSIGN(AdaGradSolver); +class AdaGradSolver: public SGDSolver { + public: + explicit AdaGradSolver(const SolverParameter& param) + : SGDSolver(param) { + constructor_sanity_check(); + } + explicit AdaGradSolver(const string& param_file) + : SGDSolver(param_file) { + constructor_sanity_check(); + } + + protected: + virtual void ComputeUpdateValue(int param_id, Dtype rate); + void constructor_sanity_check() { + CHECK_EQ(0, this->param_.momentum()) + << "Momentum cannot be used with AdaGrad."; + } + + void ocl_setup(); + protected: + cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel; + DISABLE_COPY_AND_ASSIGN (AdaGradSolver); }; template @@ -134,13 +168,13 @@ Solver* GetSolver(const SolverParameter& param) { switch (type) { case SolverParameter_SolverType_SGD: - return new SGDSolver(param); + return new SGDSolver(param); case SolverParameter_SolverType_NESTEROV: - return new NesterovSolver(param); + return new NesterovSolver(param); case SolverParameter_SolverType_ADAGRAD: - return new AdaGradSolver(param); + return new AdaGradSolver(param); default: - LOG(FATAL) << "Unknown SolverType: " << type; + LOG(FATAL) << "Unknown SolverType: " << type; } return (Solver*) NULL; } diff --git a/include/caffe/syncedmem.hpp b/include/caffe/syncedmem.hpp index 1b726de9..4092b5ac 100644 --- a/include/caffe/syncedmem.hpp +++ b/include/caffe/syncedmem.hpp @@ -1,3 +1,29 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + #ifndef CAFFE_SYNCEDMEM_HPP_ #define CAFFE_SYNCEDMEM_HPP_ @@ -31,7 +57,6 @@ inline void CaffeFreeHost(void* ptr) { free(ptr); } - /** * @brief Manages memory allocation and synchronization between the host (CPU) * and device (GPU). @@ -39,35 +64,62 @@ inline void CaffeFreeHost(void* ptr) { * TODO(dox): more thorough description. */ class SyncedMemory { - public: - SyncedMemory() - : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(0), head_(UNINITIALIZED), - own_cpu_data_(false) {} - explicit SyncedMemory(size_t size) - : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(size), head_(UNINITIALIZED), - own_cpu_data_(false) {} - ~SyncedMemory(); - const void* cpu_data(); - void set_cpu_data(void* data); - const void* gpu_data(); - void* mutable_cpu_data(); - void* mutable_gpu_data(); - enum SyncedHead { UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED }; - SyncedHead head() { return head_; } - size_t size() { return size_; } + public: + SyncedMemory() + : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(0), head_(UNINITIALIZED), own_cpu_data_( + false), data_layer_(false) { +#ifndef CPU_ONLY + ocl_setup(); +#endif + } + explicit SyncedMemory(size_t size) + : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(size), head_(UNINITIALIZED), own_cpu_data_( + false), data_layer_(false) { +#ifndef CPU_ONLY + ocl_setup(); +#endif + } - private: - void to_cpu(); - void to_gpu(); - void* cpu_ptr_; - void* gpu_ptr_; - size_t size_; - SyncedHead head_; - bool own_cpu_data_; + ~SyncedMemory(); + const void* cpu_data(); + void set_cpu_data(void* data); + const void* gpu_data(); + const void* gpu_cache_data(); + void* mutable_cpu_data(); + void* mutable_gpu_data(); + enum SyncedHead { + UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED + }; + SyncedHead head() { + return head_; + } + size_t size() { + return size_; + } + void set_data_layer() { + data_layer_ = true; + } +#ifndef CPU_ONLY + private: + void ocl_setup(); +#endif + protected: + cl_kernel oclmem_kernel; - DISABLE_COPY_AND_ASSIGN(SyncedMemory); -}; // class SyncedMemory + private: + void to_cpu(); + void to_gpu(); + void* cpu_ptr_; + void* gpu_ptr_; + void* gpu_cache_ptr_; + size_t size_; + SyncedHead head_; + bool own_cpu_data_; + bool data_layer_; + DISABLE_COPY_AND_ASSIGN (SyncedMemory); +}; +// class SyncedMemory -} // namespace caffe +}// namespace caffe #endif // CAFFE_SYNCEDMEM_HPP_ diff --git a/include/caffe/test/.test_gradient_check_util.hpp.swo b/include/caffe/test/.test_gradient_check_util.hpp.swo new 
file mode 100644 index 00000000..e3ebfc99 Binary files /dev/null and b/include/caffe/test/.test_gradient_check_util.hpp.swo differ diff --git a/include/caffe/test/test_caffe_main.hpp b/include/caffe/test/test_caffe_main.hpp index fc156091..401e2136 100644 --- a/include/caffe/test/test_caffe_main.hpp +++ b/include/caffe/test/test_caffe_main.hpp @@ -15,12 +15,12 @@ using std::cout; using std::endl; #ifdef CMAKE_BUILD - #include "caffe_config.h" +#include "caffe_config.h" #else - #define CUDA_TEST_DEVICE -1 - #define CMAKE_SOURCE_DIR "src/" - #define EXAMPLES_SOURCE_DIR "examples/" - #define CMAKE_EXT "" +#define OPENCL_TEST_DEVICE -1 +#define CMAKE_SOURCE_DIR "src/" +#define EXAMPLES_SOURCE_DIR "examples/" +#define CMAKE_EXT "" #endif int main(int argc, char** argv); @@ -28,48 +28,48 @@ int main(int argc, char** argv); namespace caffe { template -class MultiDeviceTest : public ::testing::Test { - public: - typedef typename TypeParam::Dtype Dtype; - protected: - MultiDeviceTest() { - Caffe::set_mode(TypeParam::device); - } - virtual ~MultiDeviceTest() {} +class MultiDeviceTest: public ::testing::Test { + public: + typedef typename TypeParam::Dtype Dtype; + protected: + MultiDeviceTest() { + Caffe::set_mode(TypeParam::device); + } + virtual ~MultiDeviceTest() { + } }; typedef ::testing::Types TestDtypes; template struct CPUDevice { - typedef TypeParam Dtype; - static const Caffe::Brew device = Caffe::CPU; + typedef TypeParam Dtype; + static const Caffe::Brew device = Caffe::CPU; }; template -class CPUDeviceTest : public MultiDeviceTest > { +class CPUDeviceTest: public MultiDeviceTest > { }; #ifdef CPU_ONLY typedef ::testing::Types, - CPUDevice > TestDtypesAndDevices; +CPUDevice > TestDtypesAndDevices; #else template struct GPUDevice { - typedef TypeParam Dtype; - static const Caffe::Brew device = Caffe::GPU; + typedef TypeParam Dtype; + static const Caffe::Brew device = Caffe::GPU; }; template -class GPUDeviceTest : public MultiDeviceTest > { +class GPUDeviceTest: public MultiDeviceTest > { }; -typedef ::testing::Types, CPUDevice, - GPUDevice, GPUDevice > - TestDtypesAndDevices; +typedef ::testing::Types, CPUDevice, GPUDevice, + GPUDevice > TestDtypesAndDevices; #endif diff --git a/include/caffe/test/test_gradient_check_util.hpp b/include/caffe/test/test_gradient_check_util.hpp index cc5dcbad..081ce203 100644 --- a/include/caffe/test/test_gradient_check_util.hpp +++ b/include/caffe/test/test_gradient_check_util.hpp @@ -17,56 +17,57 @@ namespace caffe { // top blobs, and checks the gradient. template class GradientChecker { - public: - // kink and kink_range specify an ignored nonsmooth region of the form - // kink - kink_range <= |feature value| <= kink + kink_range, - // which accounts for all nonsmoothness in use by caffe - GradientChecker(const Dtype stepsize, const Dtype threshold, - const unsigned int seed = 1701, const Dtype kink = 0., - const Dtype kink_range = -1) - : stepsize_(stepsize), threshold_(threshold), seed_(seed), - kink_(kink), kink_range_(kink_range) {} - // Checks the gradient of a layer, with provided bottom layers and top - // layers. - // Note that after the gradient check, we do not guarantee that the data - // stored in the layer parameters and the blobs are unchanged. 
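// Aside: a minimal usage sketch of GradientChecker for an element-wise layer
// (illustration only; the exact include paths in this tree may differ):
#include <vector>
#include "caffe/blob.hpp"
#include "caffe/neuron_layers.hpp"
#include "caffe/test/test_gradient_check_util.hpp"

template <typename Dtype>
void check_relu_gradient(const std::vector<caffe::Blob<Dtype>*>& bottom,
                         const std::vector<caffe::Blob<Dtype>*>& top) {
  caffe::LayerParameter layer_param;
  caffe::ReLULayer<Dtype> layer(layer_param);
  // stepsize, threshold, seed, kink, kink_range; the kink window skips the
  // non-differentiable point at x == 0.
  caffe::GradientChecker<Dtype> checker(1e-2, 1e-3, 1701, 0., 0.01);
  checker.CheckGradientEltwise(&layer, bottom, top);
}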
- void CheckGradient(Layer* layer, const vector*>& bottom, - const vector*>& top, int check_bottom = -1) { + public: + // kink and kink_range specify an ignored nonsmooth region of the form + // kink - kink_range <= |feature value| <= kink + kink_range, + // which accounts for all nonsmoothness in use by caffe + GradientChecker(const Dtype stepsize, const Dtype threshold, + const unsigned int seed = 1701, const Dtype kink = 0., + const Dtype kink_range = -1) + : stepsize_(stepsize), threshold_(threshold), seed_(seed), kink_(kink), kink_range_( + kink_range) { + } + // Checks the gradient of a layer, with provided bottom layers and top + // layers. + // Note that after the gradient check, we do not guarantee that the data + // stored in the layer parameters and the blobs are unchanged. + void CheckGradient(Layer* layer, const vector*>& bottom, + const vector*>& top, int check_bottom = -1) { layer->SetUp(bottom, top); CheckGradientSingle(layer, bottom, top, check_bottom, -1, -1); - } - void CheckGradientExhaustive(Layer* layer, - const vector*>& bottom, const vector*>& top, - int check_bottom = -1); + } + void CheckGradientExhaustive(Layer* layer, + const vector*>& bottom, const vector*>& top, + int check_bottom = -1); - // CheckGradientEltwise can be used to test layers that perform element-wise - // computation only (e.g., neuron layers) -- where (d y_i) / (d x_j) = 0 when - // i != j. - void CheckGradientEltwise(Layer* layer, - const vector*>& bottom, const vector*>& top); + // CheckGradientEltwise can be used to test layers that perform element-wise + // computation only (e.g., neuron layers) -- where (d y_i) / (d x_j) = 0 when + // i != j. + void CheckGradientEltwise(Layer* layer, + const vector*>& bottom, const vector*>& top); - void CheckGradientSingle(Layer* layer, - const vector*>& bottom, const vector*>& top, - int check_bottom, int top_id, int top_data_id, bool element_wise = false); + void CheckGradientSingle(Layer* layer, + const vector*>& bottom, const vector*>& top, + int check_bottom, int top_id, int top_data_id, + bool element_wise = false); - // Checks the gradient of a network. This network should not have any data - // layers or loss layers, since the function does not explicitly deal with - // such cases yet. All input blobs and parameter blobs are going to be - // checked, layer-by-layer to avoid numerical problems to accumulate. - void CheckGradientNet(const Net& net, - const vector*>& input); + // Checks the gradient of a network. This network should not have any data + // layers or loss layers, since the function does not explicitly deal with + // such cases yet. All input blobs and parameter blobs are going to be + // checked, layer-by-layer to avoid numerical problems to accumulate. 
+ void CheckGradientNet(const Net& net, + const vector*>& input); - protected: - Dtype GetObjAndGradient(const Layer& layer, - const vector*>& top, int top_id = -1, int top_data_id = -1); - Dtype stepsize_; - Dtype threshold_; - unsigned int seed_; - Dtype kink_; - Dtype kink_range_; + protected: + Dtype GetObjAndGradient(const Layer& layer, + const vector*>& top, int top_id = -1, int top_data_id = -1); + Dtype stepsize_; + Dtype threshold_; + unsigned int seed_; + Dtype kink_; + Dtype kink_range_; }; - template void GradientChecker::CheckGradientSingle(Layer* layer, const vector*>& bottom, const vector*>& top, @@ -107,8 +108,8 @@ void GradientChecker::CheckGradientSingle(Layer* layer, GetObjAndGradient(*layer, top, top_id, top_data_id); layer->Backward(top, propagate_down, bottom); // Store computed gradients for all checked blobs - vector > > - computed_gradient_blobs(blobs_to_check.size()); + vector < shared_ptr > + > computed_gradient_blobs(blobs_to_check.size()); for (int blob_id = 0; blob_id < blobs_to_check.size(); ++blob_id) { Blob* current_blob = blobs_to_check[blob_id]; computed_gradient_blobs[blob_id].reset(new Blob()); @@ -143,18 +144,18 @@ void GradientChecker::CheckGradientSingle(Layer* layer, current_blob->mutable_cpu_data()[feat_id] += stepsize_; Caffe::set_random_seed(seed_); layer->Forward(bottom, top); - positive_objective = - GetObjAndGradient(*layer, top, top_id, top_data_id); + positive_objective = GetObjAndGradient(*layer, top, top_id, + top_data_id); // Compute loss with stepsize_ subtracted from input. current_blob->mutable_cpu_data()[feat_id] -= stepsize_ * 2; Caffe::set_random_seed(seed_); layer->Forward(bottom, top); - negative_objective = - GetObjAndGradient(*layer, top, top_id, top_data_id); + negative_objective = GetObjAndGradient(*layer, top, top_id, + top_data_id); // Recover original input value. 
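// Note: the two perturbed forward passes above form a central difference,
// estimated_gradient ~ (f(x + h) - f(x - h)) / (2h) with h == stepsize_.
// Re-seeding the RNG before each pass keeps any stochastic layer
// (e.g. dropout) on the same mask, so only the perturbed feature differs
// between the two evaluations.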
current_blob->mutable_cpu_data()[feat_id] += stepsize_; - estimated_gradient = (positive_objective - negative_objective) / - stepsize_ / 2.; + estimated_gradient = (positive_objective - negative_objective) + / stepsize_ / 2.; } Dtype computed_gradient = computed_gradients[feat_id]; Dtype feature = current_blob->cpu_data()[feat_id]; @@ -167,11 +168,10 @@ void GradientChecker::CheckGradientSingle(Layer* layer, Dtype scale = std::max( std::max(fabs(computed_gradient), fabs(estimated_gradient)), 1.); EXPECT_NEAR(computed_gradient, estimated_gradient, threshold_ * scale) - << "debug: (top_id, top_data_id, blob_id, feat_id)=" - << top_id << "," << top_data_id << "," << blob_id << "," << feat_id - << "; feat = " << feature - << "; objective+ = " << positive_objective - << "; objective- = " << negative_objective; + << "debug: (top_id, top_data_id, blob_id, feat_id)=" << top_id + << "," << top_data_id << "," << blob_id << "," << feat_id + << "; feat = " << feature << "; objective+ = " << positive_objective + << "; objective- = " << negative_objective; } // LOG(ERROR) << "Feature: " << current_blob->cpu_data()[feat_id]; // LOG(ERROR) << "computed gradient: " << computed_gradient @@ -211,11 +211,11 @@ void GradientChecker::CheckGradientEltwise(Layer* layer, } template -void GradientChecker::CheckGradientNet( - const Net& net, const vector*>& input) { +void GradientChecker::CheckGradientNet(const Net& net, + const vector*>& input) { const vector > >& layers = net.layers(); - vector*> >& bottom_vecs = net.bottom_vecs(); - vector*> >& top_vecs = net.top_vecs(); + vector < vector*> > &bottom_vecs = net.bottom_vecs(); + vector < vector*> > &top_vecs = net.top_vecs(); for (int i = 0; i < layers.size(); ++i) { net.Forward(input); LOG(ERROR) << "Checking gradient for " << layers[i]->layer_param().name(); diff --git a/include/caffe/util/benchmark.hpp b/include/caffe/util/benchmark.hpp index d6358277..f48be453 100644 --- a/include/caffe/util/benchmark.hpp +++ b/include/caffe/util/benchmark.hpp @@ -8,43 +8,50 @@ namespace caffe { class Timer { - public: - Timer(); - virtual ~Timer(); - virtual void Start(); - virtual void Stop(); - virtual float MilliSeconds(); - virtual float MicroSeconds(); - virtual float Seconds(); - - inline bool initted() { return initted_; } - inline bool running() { return running_; } - inline bool has_run_at_least_once() { return has_run_at_least_once_; } - - protected: - void Init(); - - bool initted_; - bool running_; - bool has_run_at_least_once_; + public: + Timer(); + virtual ~Timer(); + virtual void Start(); + virtual void Stop(); + virtual float MilliSeconds(); + virtual float MicroSeconds(); + virtual float Seconds(); + + inline bool initted() { + return initted_; + } + inline bool running() { + return running_; + } + inline bool has_run_at_least_once() { + return has_run_at_least_once_; + } + + protected: + void Init(); + + bool initted_; + bool running_; + bool has_run_at_least_once_; #ifndef CPU_ONLY - cudaEvent_t start_gpu_; - cudaEvent_t stop_gpu_; + //cudaEvent_t start_gpu_; + //cudaEvent_t stop_gpu_; #endif - boost::posix_time::ptime start_cpu_; - boost::posix_time::ptime stop_cpu_; - float elapsed_milliseconds_; - float elapsed_microseconds_; + boost::posix_time::ptime start_cpu_; + boost::posix_time::ptime stop_cpu_; + float elapsed_milliseconds_; + float elapsed_microseconds_; }; -class CPUTimer : public Timer { - public: - explicit CPUTimer(); - virtual ~CPUTimer() {} - virtual void Start(); - virtual void Stop(); - virtual float MilliSeconds(); - virtual float 
MicroSeconds(); +class CPUTimer: public Timer { + public: + explicit CPUTimer(); + virtual ~CPUTimer() { + } + virtual void Start(); + virtual void Stop(); + virtual float MilliSeconds(); + virtual float MicroSeconds(); }; } // namespace caffe diff --git a/include/caffe/util/cudnn.hpp b/include/caffe/util/cudnn.hpp index b531dd5f..1994c48a 100644 --- a/include/caffe/util/cudnn.hpp +++ b/include/caffe/util/cudnn.hpp @@ -17,114 +17,114 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) { switch (status) { case CUDNN_STATUS_SUCCESS: - return "CUDNN_STATUS_SUCCESS"; + return "CUDNN_STATUS_SUCCESS"; case CUDNN_STATUS_NOT_INITIALIZED: - return "CUDNN_STATUS_NOT_INITIALIZED"; + return "CUDNN_STATUS_NOT_INITIALIZED"; case CUDNN_STATUS_ALLOC_FAILED: - return "CUDNN_STATUS_ALLOC_FAILED"; + return "CUDNN_STATUS_ALLOC_FAILED"; case CUDNN_STATUS_BAD_PARAM: - return "CUDNN_STATUS_BAD_PARAM"; + return "CUDNN_STATUS_BAD_PARAM"; case CUDNN_STATUS_INTERNAL_ERROR: - return "CUDNN_STATUS_INTERNAL_ERROR"; + return "CUDNN_STATUS_INTERNAL_ERROR"; case CUDNN_STATUS_INVALID_VALUE: - return "CUDNN_STATUS_INVALID_VALUE"; + return "CUDNN_STATUS_INVALID_VALUE"; case CUDNN_STATUS_ARCH_MISMATCH: - return "CUDNN_STATUS_ARCH_MISMATCH"; + return "CUDNN_STATUS_ARCH_MISMATCH"; case CUDNN_STATUS_MAPPING_ERROR: - return "CUDNN_STATUS_MAPPING_ERROR"; + return "CUDNN_STATUS_MAPPING_ERROR"; case CUDNN_STATUS_EXECUTION_FAILED: - return "CUDNN_STATUS_EXECUTION_FAILED"; + return "CUDNN_STATUS_EXECUTION_FAILED"; case CUDNN_STATUS_NOT_SUPPORTED: - return "CUDNN_STATUS_NOT_SUPPORTED"; + return "CUDNN_STATUS_NOT_SUPPORTED"; case CUDNN_STATUS_LICENSE_ERROR: - return "CUDNN_STATUS_LICENSE_ERROR"; + return "CUDNN_STATUS_LICENSE_ERROR"; } return "Unknown cudnn status"; } namespace caffe { -namespace cudnn { - -template class dataType; -template<> class dataType { - public: - static const cudnnDataType_t type = CUDNN_DATA_FLOAT; - static float oneval, zeroval; - static const void *one, *zero; -}; -template<> class dataType { - public: - static const cudnnDataType_t type = CUDNN_DATA_DOUBLE; - static double oneval, zeroval; - static const void *one, *zero; -}; - -template -inline void createTensor4dDesc(cudnnTensorDescriptor_t* desc) { - CUDNN_CHECK(cudnnCreateTensorDescriptor(desc)); -} - -template -inline void setTensor4dDesc(cudnnTensorDescriptor_t* desc, - int n, int c, int h, int w, - int stride_n, int stride_c, int stride_h, int stride_w) { - CUDNN_CHECK(cudnnSetTensor4dDescriptorEx(*desc, dataType::type, - n, c, h, w, stride_n, stride_c, stride_h, stride_w)); -} - -template -inline void setTensor4dDesc(cudnnTensorDescriptor_t* desc, - int n, int c, int h, int w) { - const int stride_w = 1; - const int stride_h = w * stride_w; - const int stride_c = h * stride_h; - const int stride_n = c * stride_c; - setTensor4dDesc(desc, n, c, h, w, - stride_n, stride_c, stride_h, stride_w); -} - -template -inline void createFilterDesc(cudnnFilterDescriptor_t* desc, - int n, int c, int h, int w) { - CUDNN_CHECK(cudnnCreateFilterDescriptor(desc)); - CUDNN_CHECK(cudnnSetFilter4dDescriptor(*desc, dataType::type, - n, c, h, w)); -} - -template -inline void createConvolutionDesc(cudnnConvolutionDescriptor_t* conv) { - CUDNN_CHECK(cudnnCreateConvolutionDescriptor(conv)); -} - -template -inline void setConvolutionDesc(cudnnConvolutionDescriptor_t* conv, - cudnnTensorDescriptor_t bottom, cudnnFilterDescriptor_t filter, - int pad_h, int pad_w, int stride_h, int stride_w) { - CUDNN_CHECK(cudnnSetConvolution2dDescriptor(*conv, - pad_h, pad_w, 
stride_h, stride_w, 1, 1, CUDNN_CROSS_CORRELATION)); -} - -template -inline void createPoolingDesc(cudnnPoolingDescriptor_t* pool_desc, - PoolingParameter_PoolMethod poolmethod, cudnnPoolingMode_t* mode, - int h, int w, int pad_h, int pad_w, int stride_h, int stride_w) { - switch (poolmethod) { - case PoolingParameter_PoolMethod_MAX: - *mode = CUDNN_POOLING_MAX; - break; - case PoolingParameter_PoolMethod_AVE: - *mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; - break; - default: - LOG(FATAL) << "Unknown pooling method."; - } - CUDNN_CHECK(cudnnCreatePoolingDescriptor(pool_desc)); - CUDNN_CHECK(cudnnSetPooling2dDescriptor(*pool_desc, *mode, h, w, - pad_h, pad_w, stride_h, stride_w)); -} - -} // namespace cudnn + namespace cudnn { + + template class dataType; + template<> class dataType { + public: + static const cudnnDataType_t type = CUDNN_DATA_FLOAT; + static float oneval, zeroval; + static const void *one, *zero; + }; + template<> class dataType { + public: + static const cudnnDataType_t type = CUDNN_DATA_DOUBLE; + static double oneval, zeroval; + static const void *one, *zero; + }; + + template + inline void createTensor4dDesc(cudnnTensorDescriptor_t* desc) { + CUDNN_CHECK(cudnnCreateTensorDescriptor(desc)); + } + + template + inline void setTensor4dDesc(cudnnTensorDescriptor_t* desc, + int n, int c, int h, int w, + int stride_n, int stride_c, int stride_h, int stride_w) { + CUDNN_CHECK(cudnnSetTensor4dDescriptorEx(*desc, dataType::type, + n, c, h, w, stride_n, stride_c, stride_h, stride_w)); + } + + template + inline void setTensor4dDesc(cudnnTensorDescriptor_t* desc, + int n, int c, int h, int w) { + const int stride_w = 1; + const int stride_h = w * stride_w; + const int stride_c = h * stride_h; + const int stride_n = c * stride_c; + setTensor4dDesc(desc, n, c, h, w, + stride_n, stride_c, stride_h, stride_w); + } + + template + inline void createFilterDesc(cudnnFilterDescriptor_t* desc, + int n, int c, int h, int w) { + CUDNN_CHECK(cudnnCreateFilterDescriptor(desc)); + CUDNN_CHECK(cudnnSetFilter4dDescriptor(*desc, dataType::type, + n, c, h, w)); + } + + template + inline void createConvolutionDesc(cudnnConvolutionDescriptor_t* conv) { + CUDNN_CHECK(cudnnCreateConvolutionDescriptor(conv)); + } + + template + inline void setConvolutionDesc(cudnnConvolutionDescriptor_t* conv, + cudnnTensorDescriptor_t bottom, cudnnFilterDescriptor_t filter, + int pad_h, int pad_w, int stride_h, int stride_w) { + CUDNN_CHECK(cudnnSetConvolution2dDescriptor(*conv, + pad_h, pad_w, stride_h, stride_w, 1, 1, CUDNN_CROSS_CORRELATION)); + } + + template + inline void createPoolingDesc(cudnnPoolingDescriptor_t* pool_desc, + PoolingParameter_PoolMethod poolmethod, cudnnPoolingMode_t* mode, + int h, int w, int pad_h, int pad_w, int stride_h, int stride_w) { + switch (poolmethod) { + case PoolingParameter_PoolMethod_MAX: + *mode = CUDNN_POOLING_MAX; + break; + case PoolingParameter_PoolMethod_AVE: + *mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; + break; + default: + LOG(FATAL) << "Unknown pooling method."; + } + CUDNN_CHECK(cudnnCreatePoolingDescriptor(pool_desc)); + CUDNN_CHECK(cudnnSetPooling2dDescriptor(*pool_desc, *mode, h, w, + pad_h, pad_w, stride_h, stride_w)); + } + + } // namespace cudnn } // namespace caffe diff --git a/include/caffe/util/db.hpp b/include/caffe/util/db.hpp index 59ec3d39..a872fb07 100644 --- a/include/caffe/util/db.hpp +++ b/include/caffe/util/db.hpp @@ -6,43 +6,52 @@ #include "caffe/common.hpp" #include "caffe/proto/caffe.pb.h" -namespace caffe { namespace db { +namespace 
caffe { +namespace db { -enum Mode { READ, WRITE, NEW }; +enum Mode { + READ, WRITE, NEW +}; class Cursor { - public: - Cursor() { } - virtual ~Cursor() { } - virtual void SeekToFirst() = 0; - virtual void Next() = 0; - virtual string key() = 0; - virtual string value() = 0; - virtual bool valid() = 0; - - DISABLE_COPY_AND_ASSIGN(Cursor); + public: + Cursor() { + } + virtual ~Cursor() { + } + virtual void SeekToFirst() = 0; + virtual void Next() = 0; + virtual string key() = 0; + virtual string value() = 0; + virtual bool valid() = 0; + + DISABLE_COPY_AND_ASSIGN (Cursor); }; class Transaction { - public: - Transaction() { } - virtual ~Transaction() { } - virtual void Put(const string& key, const string& value) = 0; - virtual void Commit() = 0; - - DISABLE_COPY_AND_ASSIGN(Transaction); + public: + Transaction() { + } + virtual ~Transaction() { + } + virtual void Put(const string& key, const string& value) = 0; + virtual void Commit() = 0; + + DISABLE_COPY_AND_ASSIGN (Transaction); }; class DB { - public: - DB() { } - virtual ~DB() { } - virtual void Open(const string& source, Mode mode) = 0; - virtual void Close() = 0; - virtual Cursor* NewCursor() = 0; - virtual Transaction* NewTransaction() = 0; - - DISABLE_COPY_AND_ASSIGN(DB); + public: + DB() { + } + virtual ~DB() { + } + virtual void Open(const string& source, Mode mode) = 0; + virtual void Close() = 0; + virtual Cursor* NewCursor() = 0; + virtual Transaction* NewTransaction() = 0; + + DISABLE_COPY_AND_ASSIGN (DB); }; DB* GetDB(DataParameter::DB backend); diff --git a/include/caffe/util/db_leveldb.hpp b/include/caffe/util/db_leveldb.hpp index 10623554..c0f6ab62 100644 --- a/include/caffe/util/db_leveldb.hpp +++ b/include/caffe/util/db_leveldb.hpp @@ -8,65 +8,86 @@ #include "caffe/util/db.hpp" -namespace caffe { namespace db { +namespace caffe { +namespace db { -class LevelDBCursor : public Cursor { - public: - explicit LevelDBCursor(leveldb::Iterator* iter) - : iter_(iter) { SeekToFirst(); } - ~LevelDBCursor() { delete iter_; } - virtual void SeekToFirst() { iter_->SeekToFirst(); } - virtual void Next() { iter_->Next(); } - virtual string key() { return iter_->key().ToString(); } - virtual string value() { return iter_->value().ToString(); } - virtual bool valid() { return iter_->Valid(); } +class LevelDBCursor: public Cursor { + public: + explicit LevelDBCursor(leveldb::Iterator* iter) + : iter_(iter) { + SeekToFirst(); + } + ~LevelDBCursor() { + delete iter_; + } + virtual void SeekToFirst() { + iter_->SeekToFirst(); + } + virtual void Next() { + iter_->Next(); + } + virtual string key() { + return iter_->key().ToString(); + } + virtual string value() { + return iter_->value().ToString(); + } + virtual bool valid() { + return iter_->Valid(); + } - private: - leveldb::Iterator* iter_; + private: + leveldb::Iterator* iter_; }; -class LevelDBTransaction : public Transaction { - public: - explicit LevelDBTransaction(leveldb::DB* db) : db_(db) { CHECK_NOTNULL(db_); } - virtual void Put(const string& key, const string& value) { - batch_.Put(key, value); - } - virtual void Commit() { - leveldb::Status status = db_->Write(leveldb::WriteOptions(), &batch_); - CHECK(status.ok()) << "Failed to write batch to leveldb " - << std::endl << status.ToString(); - } +class LevelDBTransaction: public Transaction { + public: + explicit LevelDBTransaction(leveldb::DB* db) + : db_(db) { + CHECK_NOTNULL(db_); + } + virtual void Put(const string& key, const string& value) { + batch_.Put(key, value); + } + virtual void Commit() { + leveldb::Status status = 
db_->Write(leveldb::WriteOptions(), &batch_); + CHECK(status.ok()) << "Failed to write batch to leveldb " << std::endl + << status.ToString(); + } - private: - leveldb::DB* db_; - leveldb::WriteBatch batch_; + private: + leveldb::DB* db_; + leveldb::WriteBatch batch_; - DISABLE_COPY_AND_ASSIGN(LevelDBTransaction); + DISABLE_COPY_AND_ASSIGN (LevelDBTransaction); }; -class LevelDB : public DB { - public: - LevelDB() : db_(NULL) { } - virtual ~LevelDB() { Close(); } - virtual void Open(const string& source, Mode mode); - virtual void Close() { - if (db_ != NULL) { - delete db_; - db_ = NULL; +class LevelDB: public DB { + public: + LevelDB() + : db_(NULL) { + } + virtual ~LevelDB() { + Close(); + } + virtual void Open(const string& source, Mode mode); + virtual void Close() { + if (db_ != NULL) { + delete db_; + db_ = NULL; + } + } + virtual LevelDBCursor* NewCursor() { + return new LevelDBCursor(db_->NewIterator(leveldb::ReadOptions())); + } + virtual LevelDBTransaction* NewTransaction() { + return new LevelDBTransaction(db_); } - } - virtual LevelDBCursor* NewCursor() { - return new LevelDBCursor(db_->NewIterator(leveldb::ReadOptions())); - } - virtual LevelDBTransaction* NewTransaction() { - return new LevelDBTransaction(db_); - } - private: - leveldb::DB* db_; + private: + leveldb::DB* db_; }; - } // namespace db } // namespace caffe diff --git a/include/caffe/util/db_lmdb.hpp b/include/caffe/util/db_lmdb.hpp index cc7c90af..232b439a 100644 --- a/include/caffe/util/db_lmdb.hpp +++ b/include/caffe/util/db_lmdb.hpp @@ -7,82 +7,97 @@ #include "caffe/util/db.hpp" -namespace caffe { namespace db { +namespace caffe { +namespace db { inline void MDB_CHECK(int mdb_status) { CHECK_EQ(mdb_status, MDB_SUCCESS) << mdb_strerror(mdb_status); } -class LMDBCursor : public Cursor { - public: - explicit LMDBCursor(MDB_txn* mdb_txn, MDB_cursor* mdb_cursor) - : mdb_txn_(mdb_txn), mdb_cursor_(mdb_cursor), valid_(false) { - SeekToFirst(); - } - virtual ~LMDBCursor() { - mdb_cursor_close(mdb_cursor_); - mdb_txn_abort(mdb_txn_); - } - virtual void SeekToFirst() { Seek(MDB_FIRST); } - virtual void Next() { Seek(MDB_NEXT); } - virtual string key() { - return string(static_cast(mdb_key_.mv_data), mdb_key_.mv_size); - } - virtual string value() { - return string(static_cast(mdb_value_.mv_data), - mdb_value_.mv_size); - } - virtual bool valid() { return valid_; } +class LMDBCursor: public Cursor { + public: + explicit LMDBCursor(MDB_txn* mdb_txn, MDB_cursor* mdb_cursor) + : mdb_txn_(mdb_txn), mdb_cursor_(mdb_cursor), valid_(false) { + SeekToFirst(); + } + virtual ~LMDBCursor() { + mdb_cursor_close(mdb_cursor_); + mdb_txn_abort(mdb_txn_); + } + virtual void SeekToFirst() { + Seek (MDB_FIRST); + } + virtual void Next() { + Seek (MDB_NEXT); + } + virtual string key() { + return string(static_cast(mdb_key_.mv_data), + mdb_key_.mv_size); + } + virtual string value() { + return string(static_cast(mdb_value_.mv_data), + mdb_value_.mv_size); + } + virtual bool valid() { + return valid_; + } - private: - void Seek(MDB_cursor_op op) { - int mdb_status = mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_, op); - if (mdb_status == MDB_NOTFOUND) { - valid_ = false; - } else { - MDB_CHECK(mdb_status); - valid_ = true; + private: + void Seek(MDB_cursor_op op) { + int mdb_status = mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_, op); + if (mdb_status == MDB_NOTFOUND) { + valid_ = false; + } else { + MDB_CHECK(mdb_status); + valid_ = true; + } } - } - MDB_txn* mdb_txn_; - MDB_cursor* mdb_cursor_; - MDB_val mdb_key_, mdb_value_; - 
bool valid_; + MDB_txn* mdb_txn_; + MDB_cursor* mdb_cursor_; + MDB_val mdb_key_, mdb_value_; + bool valid_; }; -class LMDBTransaction : public Transaction { - public: - explicit LMDBTransaction(MDB_dbi* mdb_dbi, MDB_txn* mdb_txn) - : mdb_dbi_(mdb_dbi), mdb_txn_(mdb_txn) { } - virtual void Put(const string& key, const string& value); - virtual void Commit() { MDB_CHECK(mdb_txn_commit(mdb_txn_)); } +class LMDBTransaction: public Transaction { + public: + explicit LMDBTransaction(MDB_dbi* mdb_dbi, MDB_txn* mdb_txn) + : mdb_dbi_(mdb_dbi), mdb_txn_(mdb_txn) { + } + virtual void Put(const string& key, const string& value); + virtual void Commit() { + MDB_CHECK(mdb_txn_commit(mdb_txn_)); + } - private: - MDB_dbi* mdb_dbi_; - MDB_txn* mdb_txn_; + private: + MDB_dbi* mdb_dbi_; + MDB_txn* mdb_txn_; - DISABLE_COPY_AND_ASSIGN(LMDBTransaction); + DISABLE_COPY_AND_ASSIGN (LMDBTransaction); }; -class LMDB : public DB { - public: - LMDB() : mdb_env_(NULL) { } - virtual ~LMDB() { Close(); } - virtual void Open(const string& source, Mode mode); - virtual void Close() { - if (mdb_env_ != NULL) { - mdb_dbi_close(mdb_env_, mdb_dbi_); - mdb_env_close(mdb_env_); - mdb_env_ = NULL; +class LMDB: public DB { + public: + LMDB() + : mdb_env_(NULL) { + } + virtual ~LMDB() { + Close(); + } + virtual void Open(const string& source, Mode mode); + virtual void Close() { + if (mdb_env_ != NULL) { + mdb_dbi_close(mdb_env_, mdb_dbi_); + mdb_env_close(mdb_env_); + mdb_env_ = NULL; + } } - } - virtual LMDBCursor* NewCursor(); - virtual LMDBTransaction* NewTransaction(); + virtual LMDBCursor* NewCursor(); + virtual LMDBTransaction* NewTransaction(); - private: - MDB_env* mdb_env_; - MDB_dbi mdb_dbi_; + private: + MDB_env* mdb_env_; + MDB_dbi mdb_dbi_; }; } // namespace db diff --git a/include/caffe/util/device_alternate.hpp b/include/caffe/util/device_alternate.hpp index 6ea595db..bf5d7705 100644 --- a/include/caffe/util/device_alternate.hpp +++ b/include/caffe/util/device_alternate.hpp @@ -31,70 +31,11 @@ void classname::funcname##_##gpu(const vector*>& top, \ #else // Normal GPU + CPU Caffe. -#include -#include -#include -#include -#include // cuda driver types #ifdef USE_CUDNN // cuDNN acceleration library. #include "caffe/util/cudnn.hpp" #endif -// -// CUDA macros -// - -// CUDA: various checks for different function calls. -#define CUDA_CHECK(condition) \ - /* Code block avoids redefinition of cudaError_t error */ \ - do { \ - cudaError_t error = condition; \ - CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \ - } while (0) - -#define CUBLAS_CHECK(condition) \ - do { \ - cublasStatus_t status = condition; \ - CHECK_EQ(status, CUBLAS_STATUS_SUCCESS) << " " \ - << caffe::cublasGetErrorString(status); \ - } while (0) - -#define CURAND_CHECK(condition) \ - do { \ - curandStatus_t status = condition; \ - CHECK_EQ(status, CURAND_STATUS_SUCCESS) << " " \ - << caffe::curandGetErrorString(status); \ - } while (0) - -// CUDA: grid stride looping -#define CUDA_KERNEL_LOOP(i, n) \ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ - i < (n); \ - i += blockDim.x * gridDim.x) - -// CUDA: check for error after kernel execution and exit loudly if there is one. -#define CUDA_POST_KERNEL_CHECK CUDA_CHECK(cudaPeekAtLastError()) - namespace caffe { - -// CUDA: library error reporting. -const char* cublasGetErrorString(cublasStatus_t error); -const char* curandGetErrorString(curandStatus_t error); - -// CUDA: thread number configuration. 
-// Use 1024 threads per block, which requires cuda sm_2x or above, -// or fall back to attempt compatibility (best of luck to you). -#if __CUDA_ARCH__ >= 200 - const int CAFFE_CUDA_NUM_THREADS = 1024; -#else - const int CAFFE_CUDA_NUM_THREADS = 512; -#endif - -// CUDA: number of blocks for threads. -inline int CAFFE_GET_BLOCKS(const int N) { - return (N + CAFFE_CUDA_NUM_THREADS - 1) / CAFFE_CUDA_NUM_THREADS; -} - } // namespace caffe #endif // CPU_ONLY diff --git a/include/caffe/util/im2col.hpp b/include/caffe/util/im2col.hpp index 0051e2fa..9c6de363 100644 --- a/include/caffe/util/im2col.hpp +++ b/include/caffe/util/im2col.hpp @@ -1,32 +1,74 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ **************************************************************************************/ + #ifndef _CAFFE_UTIL_IM2COL_HPP_ #define _CAFFE_UTIL_IM2COL_HPP_ namespace caffe { template -void im2col_cpu(const Dtype* data_im, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, Dtype* data_col); +void im2col_cpu(const Dtype* data_im, const int channels, const int height, + const int width, const int kernel_h, const int kernel_w, const int pad_h, + const int pad_w, const int stride_h, const int stride_w, Dtype* data_col); + +template +void col2im_cpu(const Dtype* data_col, const int channels, const int height, + const int width, const int patch_h, const int patch_w, const int pad_h, + const int pad_w, const int stride_h, const int stride_w, Dtype* data_im); +#ifndef CPU_ONLY template -void col2im_cpu(const Dtype* data_col, const int channels, - const int height, const int width, const int patch_h, const int patch_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, Dtype* data_im); +void col2im_gpu(const Dtype* data_col, const int col_offset, const int height, + const int width, const int channels, const int patch_h, const int patch_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + Dtype* data_im, const int img_offset); template -void im2col_gpu(const Dtype* data_im, const int channels, +void im2col_gpu(const Dtype* data_im, const int img_offset, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, Dtype* data_col); + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + Dtype* data_col, const int col_offset); template -void col2im_gpu(const Dtype* data_col, const int channels, - const int height, const int width, const int patch_h, const int patch_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, Dtype* data_im); +void im2col_gpu_opt(const Dtype* data_im, const int img_offset, + const int channels, const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, Dtype* data_col, const int col_offset, + int optnum); +template +void col2im_gpu(cl_kernel Kernel, const Dtype* data_col, const int col_offset, + const int channels, const int height, const int width, const int psize, + const int pad, const int stride, Dtype* data_im, const int img_offset); + +template +void col2im_gpu_opt(const Dtype* data_col, const int col_offset, + const int channels, const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, Dtype* data_im, const int img_offset, + int optnum); +#endif } // namespace caffe #endif // CAFFE_UTIL_IM2COL_HPP_ diff --git a/include/caffe/util/io.hpp b/include/caffe/util/io.hpp index 3a62c3c9..c04cce6a 100644 --- a/include/caffe/util/io.hpp +++ b/include/caffe/util/io.hpp @@ -38,8 +38,8 @@ inline void MakeTempDir(string* temp_dirname) { // NOLINT_NEXT_LINE(runtime/printf) strcpy(temp_dirname_cstr, temp_dirname->c_str()); char* mkdtemp_result = mkdtemp(temp_dirname_cstr); - CHECK(mkdtemp_result != NULL) - << "Failed to create a temporary directory at: " << *temp_dirname; + CHECK(mkdtemp_result != NULL) << "Failed to create a temporary directory at: " + << 
*temp_dirname; *temp_dirname = temp_dirname_cstr; delete[] temp_dirname_cstr; } @@ -74,14 +74,13 @@ inline void ReadProtoFromBinaryFileOrDie(const char* filename, Message* proto) { } inline void ReadProtoFromBinaryFileOrDie(const string& filename, - Message* proto) { + Message* proto) { ReadProtoFromBinaryFileOrDie(filename.c_str(), proto); } - void WriteProtoToBinaryFile(const Message& proto, const char* filename); -inline void WriteProtoToBinaryFile( - const Message& proto, const string& filename) { +inline void WriteProtoToBinaryFile(const Message& proto, + const string& filename) { WriteProtoToBinaryFile(proto, filename.c_str()); } @@ -91,14 +90,13 @@ inline bool ReadFileToDatum(const string& filename, Datum* datum) { return ReadFileToDatum(filename, -1, datum); } -bool ReadImageToDatum(const string& filename, const int label, - const int height, const int width, const bool is_color, - const std::string & encoding, Datum* datum); +bool ReadImageToDatum(const string& filename, const int label, const int height, + const int width, const bool is_color, const std::string & encoding, + Datum* datum); inline bool ReadImageToDatum(const string& filename, const int label, const int height, const int width, const bool is_color, Datum* datum) { - return ReadImageToDatum(filename, label, height, width, is_color, - "", datum); + return ReadImageToDatum(filename, label, height, width, is_color, "", datum); } inline bool ReadImageToDatum(const string& filename, const int label, @@ -124,14 +122,13 @@ inline bool ReadImageToDatum(const string& filename, const int label, bool DecodeDatumNative(Datum* datum); bool DecodeDatum(Datum* datum, bool is_color); -cv::Mat ReadImageToCVMat(const string& filename, - const int height, const int width, const bool is_color); +cv::Mat ReadImageToCVMat(const string& filename, const int height, + const int width, const bool is_color); -cv::Mat ReadImageToCVMat(const string& filename, - const int height, const int width); +cv::Mat ReadImageToCVMat(const string& filename, const int height, + const int width); -cv::Mat ReadImageToCVMat(const string& filename, - const bool is_color); +cv::Mat ReadImageToCVMat(const string& filename, const bool is_color); cv::Mat ReadImageToCVMat(const string& filename); @@ -141,18 +138,16 @@ cv::Mat DecodeDatumToCVMat(const Datum& datum, bool is_color); void CVMatToDatum(const cv::Mat& cv_img, Datum* datum); template -void hdf5_load_nd_dataset_helper( - hid_t file_id, const char* dataset_name_, int min_dim, int max_dim, - Blob* blob); +void hdf5_load_nd_dataset_helper(hid_t file_id, const char* dataset_name_, + int min_dim, int max_dim, Blob* blob); template -void hdf5_load_nd_dataset( - hid_t file_id, const char* dataset_name_, int min_dim, int max_dim, - Blob* blob); +void hdf5_load_nd_dataset(hid_t file_id, const char* dataset_name_, int min_dim, + int max_dim, Blob* blob); template -void hdf5_save_nd_dataset( - const hid_t file_id, const string& dataset_name, const Blob& blob); +void hdf5_save_nd_dataset(const hid_t file_id, const string& dataset_name, + const Blob& blob); } // namespace caffe diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp index 2cacd8e7..4ca1fac0 100644 --- a/include/caffe/util/math_functions.hpp +++ b/include/caffe/util/math_functions.hpp @@ -1,24 +1,75 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + #ifndef CAFFE_UTIL_MATH_FUNCTIONS_H_ #define CAFFE_UTIL_MATH_FUNCTIONS_H_ #include #include // for std::fabs and std::signbit - +#include +#include #include "glog/logging.h" -#include "caffe/common.hpp" -#include "caffe/util/device_alternate.hpp" #include "caffe/util/mkl_alternate.hpp" +#include "caffe/util/ocl_util.hpp" namespace caffe { -// Caffe gemm provides a simpler interface to the gemm functions, with the +// Decaf gemm provides a simpler interface to the gemm functions, with the // limitation that the data has to be contiguous in memory. template -void caffe_cpu_gemm(const CBLAS_TRANSPOSE TransA, +void caffe_cpu_gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const Dtype alpha, const Dtype* A, + const Dtype* B, const Dtype beta, Dtype* C); + +// Decaf gpu gemm provides an interface that is almost the same as the cpu +// gemm function - following the c convention and calling the fortran-order +// gpu code under the hood. 
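Both the CPU wrapper above and the OpenCL gemm overloads declared next keep the row-major C convention described in the comment: C = alpha * op(A) * op(B) + beta * C, with op(A) of size MxK, op(B) of size KxN, and C of size MxN. A minimal usage sketch against the CPU entry point (the GPU overloads additionally take buffer offsets for the clBLAS path); this assumes a build that links the Caffe math functions and their BLAS backend.

#include <vector>
#include "caffe/util/math_functions.hpp"

int main() {
  const int M = 2, N = 2, K = 3;
  // A is M x K, B is K x N, C is M x N, all row-major and contiguous in memory.
  std::vector<float> A = {1, 2, 3,
                          4, 5, 6};
  std::vector<float> B = {1, 0,
                          0, 1,
                          1, 1};
  std::vector<float> C(M * N, 0.f);
  // C = 1.0 * A * B + 0.0 * C  ->  {{4, 5}, {10, 11}}
  caffe::caffe_cpu_gemm<float>(CblasNoTrans, CblasNoTrans, M, N, K,
                               1.f, A.data(), B.data(), 0.f, C.data());
  return 0;
}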
+template +void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const Dtype alpha, const Dtype* A, + const Dtype* B, const Dtype beta, Dtype* C); + +template +cl_event caffe_gpu_gemm(cl_command_queue *queue, const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const Dtype alpha, const Dtype* A, const int offA, const Dtype* B, + const int offB, const Dtype beta, Dtype* C, const int offC); +/*This is Yuan Gao's sgemm_ex*/ +template +void caffe_gpu_exgemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, - Dtype* C); + Dtype* C, const int offset1, const int offset2, const int offset3); + +template +cl_event caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const Dtype alpha, const Dtype* A, const int offA, const Dtype* B, + const int offB, const Dtype beta, Dtype* C, const int offC); template void caffe_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, @@ -26,29 +77,75 @@ void caffe_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, Dtype* y); template -void caffe_axpy(const int N, const Dtype alpha, const Dtype* X, - Dtype* Y); +void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const Dtype alpha, const Dtype* A, size_t offA, int lda, const Dtype * x, + size_t offx, const Dtype beta, int incx, Dtype* y, size_t offy, int incy); + +template +void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, + Dtype* y); + +template +void caffe_axpy(const int N, const Dtype alpha, const Dtype* X, Dtype* Y); + +template +void caffe_gpu_axpy(const int N, const Dtype alpha, const Dtype* X, Dtype* Y); template void caffe_cpu_axpby(const int N, const Dtype alpha, const Dtype* X, const Dtype beta, Dtype* Y); +template +void caffe_gpu_axpby(const int N, const Dtype alpha, const Dtype* X, + const Dtype beta, Dtype* Y); + template void caffe_copy(const int N, const Dtype *X, Dtype *Y); template void caffe_set(const int N, const Dtype alpha, Dtype *X); +template +void caffe_gpu_set(const int N, const Dtype alpha, Dtype *X, const int offset=0); + inline void caffe_memset(const size_t N, const int alpha, void* X) { memset(X, alpha, N); // NOLINT(caffe/alt_fn) } +inline void caffe_gpu_memset(const size_t N, const int alpha, void* X) { +#ifndef CPU_ONLY + ocl_memset((int*) X, (alpha << 24) | (alpha << 16) | (alpha << 8) | alpha, N); +#endif +} + +void caffe_gpu_memcpy(const size_t N, const void *X, void *Y); + +template +void caffe_gpu_memcpy(const size_t N, const Dtype *X, Dtype *Y); + +template +void caffe_gpu_copy(const int N, const Dtype *X, Dtype *Y); + +template +void caffe_gpu_copy(const int N, const Dtype* X, const int offx, Dtype* Y, const int offy); + template void caffe_add_scalar(const int N, const Dtype alpha, Dtype *X); +template +void caffe_gpu_add_scalar(const int N, const Dtype alpha, Dtype *X); + +template +void caffe_add_scalar(cl_kernel kernel, const int N, const Dtype alpha, + Dtype *X); + template void caffe_scal(const int N, const Dtype alpha, Dtype *X); +template +void caffe_gpu_scal(const int N, const Dtype alpha, Dtype *X, const int offx = 0); + template void caffe_sqr(const int N, const Dtype* a, Dtype* y); @@ -61,12 +158,27 @@ void caffe_sub(const 
int N, const Dtype* a, const Dtype* b, Dtype* y); template void caffe_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y); +template +void caffe_gpu_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y); + template void caffe_div(const int N, const Dtype* a, const Dtype* b, Dtype* y); +//CUDA version, need to be deleted +template +void caffe_gpu_div(const int N, const Dtype* a, const Dtype* b, Dtype* y); + +template +void caffe_gpu_div(cl_kernel kernel, const int N, const Dtype* a, + const Dtype* b, Dtype* y); + template void caffe_powx(const int n, const Dtype* a, const Dtype b, Dtype* y); +//CUDA version, need to be deleted +template +void caffe_gpu_powx(const int n, const Dtype* a, const Dtype b, Dtype* y); + unsigned int caffe_rng_rand(); template @@ -75,9 +187,25 @@ Dtype caffe_nextafter(const Dtype b); template void caffe_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r); +// caffe_gpu_rng_uniform with two arguments generates integers in the range +// [0, UINT_MAX]. +void caffe_gpu_rng_uniform(const int n, unsigned int* r); + +// caffe_gpu_rng_uniform with four arguments generates floats in the range +// (a, b] (strictly greater than a, less than or equal to b) due to the +// specification of curandGenerateUniform. With a = 0, b = 1, just calls +// curandGenerateUniform; with other limits will shift and scale the outputs +// appropriately after calling curandGenerateUniform. +template +void caffe_gpu_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r); + template void caffe_rng_gaussian(const int n, const Dtype mu, const Dtype sigma, - Dtype* r); + Dtype* r); + +template +void caffe_gpu_rng_gaussian(const int n, const Dtype mu, const Dtype sigma, + Dtype* r); template void caffe_rng_bernoulli(const int n, const Dtype p, int* r); @@ -86,32 +214,41 @@ template void caffe_rng_bernoulli(const int n, const Dtype p, unsigned int* r); template -void caffe_exp(const int n, const Dtype* a, Dtype* y); +void caffe_gpu_rng_bernoulli(const int n, const Dtype p, int* r); template -void caffe_log(const int n, const Dtype* a, Dtype* y); +void caffe_exp(const int n, const Dtype* a, Dtype* y); template -void caffe_abs(const int n, const Dtype* a, Dtype* y); +Dtype caffe_cpu_dot(const int n, const Dtype* x, const Dtype* y); template -Dtype caffe_cpu_dot(const int n, const Dtype* x, const Dtype* y); +void caffe_gpu_dot(const int n, const Dtype* x, const Dtype* y, Dtype* out); template -Dtype caffe_cpu_strided_dot(const int n, const Dtype* x, const int incx, - const Dtype* y, const int incy); +void caffe_gpu_dot(const int n, const Dtype* x, size_t offx, const Dtype* y, size_t offy, Dtype* out); template int caffe_cpu_hamming_distance(const int n, const Dtype* x, const Dtype* y); +template +uint32_t caffe_gpu_hamming_distance(const int n, const Dtype* x, + const Dtype* y); + // Returns the sum of the absolute values of the elements of vector x template Dtype caffe_cpu_asum(const int n, const Dtype* x); +template +void caffe_gpu_asum(const int n, const Dtype* x, Dtype* y); + +template +void caffe_gpu_asum(const int n, const Dtype* x, size_t offx, Dtype* y); + // the branchless, type-safe version from // http://stackoverflow.com/questions/1903954/is-there-a-standard-sign-function-signum-sgn-in-c-c -template -inline int8_t caffe_sign(Dtype val) { +template +inline char caffe_sign(Dtype val) { return (Dtype(0) < val) - (val < Dtype(0)); } @@ -130,63 +267,54 @@ inline int8_t caffe_sign(Dtype val) { } \ } -// output is 1 for the positives, 0 for zero, and -1 for the 
negatives -DEFINE_CAFFE_CPU_UNARY_FUNC(sign, y[i] = caffe_sign(x[i])); - -// This returns a nonzero value if the input has its sign bit set. -// The name sngbit is meant to avoid conflicts with std::signbit in the macro. -// The extra parens are needed because CUDA < 6.5 defines signbit as a macro, -// and we don't want that to expand here when CUDA headers are also included. -DEFINE_CAFFE_CPU_UNARY_FUNC(sgnbit, \ - y[i] = static_cast((std::signbit)(x[i]))); - -DEFINE_CAFFE_CPU_UNARY_FUNC(fabs, y[i] = std::fabs(x[i])); +#define INSTANTIATE_CAFFE_CPU_UNARY_FUNC(name) \ + template <> \ + void caffe_cpu_##name(const int n, const float* x, float* y); \ + template <> \ + void caffe_cpu_##name(const int n, const double* x, double* y) -template -void caffe_cpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y); +#define DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(name, operation) \ +template \ +void name##_kernel(const int n, const Dtype* x, Dtype* y) { \ + operation; \ +} \ +template <> \ +void caffe_gpu_##name(const int n, const float* x, float* y) { \ +} \ +template <> \ +void caffe_gpu_##name(const int n, const double* x, double* y) { \ +} -#ifndef CPU_ONLY // GPU +// output is 1 for the positives, 0 for zero, and -1 for the negatives +DEFINE_CAFFE_CPU_UNARY_FUNC(sign, y[i] = caffe_sign(x[i])); -// Decaf gpu gemm provides an interface that is almost the same as the cpu -// gemm function - following the c convention and calling the fortran-order -// gpu code under the hood. template -void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, - Dtype* C); +void caffe_gpu_sign(const int n, const Dtype* x, Dtype* y); template -void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, - const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, - Dtype* y); +void caffe_gpu_sign(const int N, const Dtype *X, const int offx, Dtype *Y, const int offy); -template -void caffe_gpu_axpy(const int N, const Dtype alpha, const Dtype* X, - Dtype* Y); +// This returns a nonzero value if the input has its sign bit set. 
+// The name sngbit is meant to avoid conflicts with std::signbit in the macro +using std::signbit; +DEFINE_CAFFE_CPU_UNARY_FUNC(sgnbit, y[i] = signbit(x[i])); template -void caffe_gpu_axpby(const int N, const Dtype alpha, const Dtype* X, - const Dtype beta, Dtype* Y); +void caffe_gpu_sgnbit(const int n, const Dtype* x, Dtype* y); -void caffe_gpu_memcpy(const size_t N, const void *X, void *Y); +DEFINE_CAFFE_CPU_UNARY_FUNC(fabs, y[i] = std::fabs(x[i])); template -void caffe_gpu_set(const int N, const Dtype alpha, Dtype *X); +void caffe_gpu_fabs(const int n, const Dtype* x, Dtype* y); -inline void caffe_gpu_memset(const size_t N, const int alpha, void* X) { -#ifndef CPU_ONLY - CUDA_CHECK(cudaMemset(X, alpha, N)); // NOLINT(caffe/alt_fn) -#else - NO_GPU; -#endif -} +template +void caffe_cpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y); template -void caffe_gpu_add_scalar(const int N, const Dtype alpha, Dtype *X); +void caffe_gpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y); template -void caffe_gpu_scal(const int N, const Dtype alpha, Dtype *X); +void caffe_gpu_scale(const int n, const Dtype alpha, const Dtype *x, const int offx, Dtype* y, const int offy); template void caffe_gpu_add(const int N, const Dtype* a, const Dtype* b, Dtype* y); @@ -212,69 +340,18 @@ void caffe_gpu_log(const int n, const Dtype* a, Dtype* y); template void caffe_gpu_powx(const int n, const Dtype* a, const Dtype b, Dtype* y); -// caffe_gpu_rng_uniform with two arguments generates integers in the range -// [0, UINT_MAX]. -void caffe_gpu_rng_uniform(const int n, unsigned int* r); - -// caffe_gpu_rng_uniform with four arguments generates floats in the range -// (a, b] (strictly greater than a, less than or equal to b) due to the -// specification of curandGenerateUniform. With a = 0, b = 1, just calls -// curandGenerateUniform; with other limits will shift and scale the outputs -// appropriately after calling curandGenerateUniform. 
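The comment removed here documents the convention the uniform RNG wrappers follow: a base generator yielding u in (0, 1] is mapped onto (a, b] by r = a + u * (b - a). A host-side sketch of that shift-and-scale; std::mt19937 is used purely as a stand-in for the device generator and is not part of the patch.

#include <cassert>
#include <random>
#include <vector>

// Map uniform samples from (0, 1] onto (a, b], mirroring the documented
// shift-and-scale applied after the base uniform generator.
static void rng_uniform(const int n, const float a, const float b, float* r,
                        std::mt19937* gen) {
  std::uniform_real_distribution<float> dist(0.f, 1.f);  // stand-in base generator
  for (int i = 0; i < n; ++i) {
    const float u = 1.f - dist(*gen);  // flip [0, 1) into (0, 1]
    r[i] = a + u * (b - a);
  }
}

int main() {
  std::mt19937 gen(1701);
  std::vector<float> r(1000);
  rng_uniform(static_cast<int>(r.size()), -2.f, 3.f, r.data(), &gen);
  for (float v : r) assert(v > -2.f && v <= 3.f);  // strictly greater than a, at most b
  return 0;
}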
-template -void caffe_gpu_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r); - template -void caffe_gpu_rng_gaussian(const int n, const Dtype mu, const Dtype sigma, - Dtype* r); - -template -void caffe_gpu_rng_bernoulli(const int n, const Dtype p, int* r); - -template -void caffe_gpu_dot(const int n, const Dtype* x, const Dtype* y, Dtype* out); - -template -uint32_t caffe_gpu_hamming_distance(const int n, const Dtype* x, - const Dtype* y); +void caffe_exp(const int n, const Dtype* a, Dtype* y); template -void caffe_gpu_asum(const int n, const Dtype* x, Dtype* y); - -template -void caffe_gpu_sign(const int n, const Dtype* x, Dtype* y); - -template -void caffe_gpu_sgnbit(const int n, const Dtype* x, Dtype* y); +void caffe_abs(const int n, const Dtype* a, Dtype* y); template -void caffe_gpu_fabs(const int n, const Dtype* x, Dtype* y); +void caffe_log(const int n, const Dtype* a, Dtype* y); template -void caffe_gpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y); - -#define DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(name, operation) \ -template \ -__global__ void name##_kernel(const int n, const Dtype* x, Dtype* y) { \ - CUDA_KERNEL_LOOP(index, n) { \ - operation; \ - } \ -} \ -template <> \ -void caffe_gpu_##name(const int n, const float* x, float* y) { \ - /* NOLINT_NEXT_LINE(whitespace/operators) */ \ - name##_kernel<<>>( \ - n, x, y); \ -} \ -template <> \ -void caffe_gpu_##name(const int n, const double* x, double* y) { \ - /* NOLINT_NEXT_LINE(whitespace/operators) */ \ - name##_kernel<<>>( \ - n, x, y); \ -} - -#endif // !CPU_ONLY - +Dtype caffe_cpu_strided_dot(const int n, const Dtype* x, const int incx, + const Dtype* y, const int incy); } // namespace caffe #endif // CAFFE_UTIL_MATH_FUNCTIONS_H_ diff --git a/include/caffe/util/math_functions.hpp.protect b/include/caffe/util/math_functions.hpp.protect new file mode 100644 index 00000000..2cacd8e7 --- /dev/null +++ b/include/caffe/util/math_functions.hpp.protect @@ -0,0 +1,280 @@ +#ifndef CAFFE_UTIL_MATH_FUNCTIONS_H_ +#define CAFFE_UTIL_MATH_FUNCTIONS_H_ + +#include +#include // for std::fabs and std::signbit + +#include "glog/logging.h" + +#include "caffe/common.hpp" +#include "caffe/util/device_alternate.hpp" +#include "caffe/util/mkl_alternate.hpp" + +namespace caffe { + +// Caffe gemm provides a simpler interface to the gemm functions, with the +// limitation that the data has to be contiguous in memory. 
+template +void caffe_cpu_gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, + Dtype* C); + +template +void caffe_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, + Dtype* y); + +template +void caffe_axpy(const int N, const Dtype alpha, const Dtype* X, + Dtype* Y); + +template +void caffe_cpu_axpby(const int N, const Dtype alpha, const Dtype* X, + const Dtype beta, Dtype* Y); + +template +void caffe_copy(const int N, const Dtype *X, Dtype *Y); + +template +void caffe_set(const int N, const Dtype alpha, Dtype *X); + +inline void caffe_memset(const size_t N, const int alpha, void* X) { + memset(X, alpha, N); // NOLINT(caffe/alt_fn) +} + +template +void caffe_add_scalar(const int N, const Dtype alpha, Dtype *X); + +template +void caffe_scal(const int N, const Dtype alpha, Dtype *X); + +template +void caffe_sqr(const int N, const Dtype* a, Dtype* y); + +template +void caffe_add(const int N, const Dtype* a, const Dtype* b, Dtype* y); + +template +void caffe_sub(const int N, const Dtype* a, const Dtype* b, Dtype* y); + +template +void caffe_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y); + +template +void caffe_div(const int N, const Dtype* a, const Dtype* b, Dtype* y); + +template +void caffe_powx(const int n, const Dtype* a, const Dtype b, Dtype* y); + +unsigned int caffe_rng_rand(); + +template +Dtype caffe_nextafter(const Dtype b); + +template +void caffe_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r); + +template +void caffe_rng_gaussian(const int n, const Dtype mu, const Dtype sigma, + Dtype* r); + +template +void caffe_rng_bernoulli(const int n, const Dtype p, int* r); + +template +void caffe_rng_bernoulli(const int n, const Dtype p, unsigned int* r); + +template +void caffe_exp(const int n, const Dtype* a, Dtype* y); + +template +void caffe_log(const int n, const Dtype* a, Dtype* y); + +template +void caffe_abs(const int n, const Dtype* a, Dtype* y); + +template +Dtype caffe_cpu_dot(const int n, const Dtype* x, const Dtype* y); + +template +Dtype caffe_cpu_strided_dot(const int n, const Dtype* x, const int incx, + const Dtype* y, const int incy); + +template +int caffe_cpu_hamming_distance(const int n, const Dtype* x, const Dtype* y); + +// Returns the sum of the absolute values of the elements of vector x +template +Dtype caffe_cpu_asum(const int n, const Dtype* x); + +// the branchless, type-safe version from +// http://stackoverflow.com/questions/1903954/is-there-a-standard-sign-function-signum-sgn-in-c-c +template +inline int8_t caffe_sign(Dtype val) { + return (Dtype(0) < val) - (val < Dtype(0)); +} + +// The following two macros are modifications of DEFINE_VSL_UNARY_FUNC +// in include/caffe/util/mkl_alternate.hpp authored by @Rowland Depp. +// Please refer to commit 7e8ef25c7 of the boost-eigen branch. +// Git cherry picking that commit caused a conflict hard to resolve and +// copying that file in convenient for code reviewing. +// So they have to be pasted here temporarily. 
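The DEFINE_CAFFE_CPU_UNARY_FUNC macro that follows stamps out element-wise CPU helpers; for example, DEFINE_CAFFE_CPU_UNARY_FUNC(sign, y[i] = caffe_sign(x[i])) yields caffe_cpu_sign, which maps positives to 1, zero to 0, and negatives to -1 via the branchless caffe_sign. Below is a hand-expanded sketch of that instantiation and its use, written standalone for clarity (the glog CHECK_GT/CHECK argument checks from the real macro are omitted here).

#include <cassert>
#include <vector>

// Branchless sign, as declared in the header: (0 < v) - (v < 0).
template <typename Dtype>
inline char caffe_sign(Dtype val) {
  return (Dtype(0) < val) - (val < Dtype(0));
}

// Hand-expanded equivalent of DEFINE_CAFFE_CPU_UNARY_FUNC(sign, y[i] = caffe_sign(x[i])),
// with the glog checks dropped to keep the sketch self-contained.
template <typename Dtype>
void caffe_cpu_sign(const int n, const Dtype* x, Dtype* y) {
  for (int i = 0; i < n; ++i) {
    y[i] = caffe_sign(x[i]);
  }
}

int main() {
  std::vector<double> x = {-3.5, 0.0, 2.25};
  std::vector<double> y(x.size());
  caffe_cpu_sign(static_cast<int>(x.size()), x.data(), y.data());
  assert(y[0] == -1.0 && y[1] == 0.0 && y[2] == 1.0);
  return 0;
}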
+#define DEFINE_CAFFE_CPU_UNARY_FUNC(name, operation) \ + template \ + void caffe_cpu_##name(const int n, const Dtype* x, Dtype* y) { \ + CHECK_GT(n, 0); CHECK(x); CHECK(y); \ + for (int i = 0; i < n; ++i) { \ + operation; \ + } \ + } + +// output is 1 for the positives, 0 for zero, and -1 for the negatives +DEFINE_CAFFE_CPU_UNARY_FUNC(sign, y[i] = caffe_sign(x[i])); + +// This returns a nonzero value if the input has its sign bit set. +// The name sngbit is meant to avoid conflicts with std::signbit in the macro. +// The extra parens are needed because CUDA < 6.5 defines signbit as a macro, +// and we don't want that to expand here when CUDA headers are also included. +DEFINE_CAFFE_CPU_UNARY_FUNC(sgnbit, \ + y[i] = static_cast((std::signbit)(x[i]))); + +DEFINE_CAFFE_CPU_UNARY_FUNC(fabs, y[i] = std::fabs(x[i])); + +template +void caffe_cpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y); + +#ifndef CPU_ONLY // GPU + +// Decaf gpu gemm provides an interface that is almost the same as the cpu +// gemm function - following the c convention and calling the fortran-order +// gpu code under the hood. +template +void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, + Dtype* C); + +template +void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, + Dtype* y); + +template +void caffe_gpu_axpy(const int N, const Dtype alpha, const Dtype* X, + Dtype* Y); + +template +void caffe_gpu_axpby(const int N, const Dtype alpha, const Dtype* X, + const Dtype beta, Dtype* Y); + +void caffe_gpu_memcpy(const size_t N, const void *X, void *Y); + +template +void caffe_gpu_set(const int N, const Dtype alpha, Dtype *X); + +inline void caffe_gpu_memset(const size_t N, const int alpha, void* X) { +#ifndef CPU_ONLY + CUDA_CHECK(cudaMemset(X, alpha, N)); // NOLINT(caffe/alt_fn) +#else + NO_GPU; +#endif +} + +template +void caffe_gpu_add_scalar(const int N, const Dtype alpha, Dtype *X); + +template +void caffe_gpu_scal(const int N, const Dtype alpha, Dtype *X); + +template +void caffe_gpu_add(const int N, const Dtype* a, const Dtype* b, Dtype* y); + +template +void caffe_gpu_sub(const int N, const Dtype* a, const Dtype* b, Dtype* y); + +template +void caffe_gpu_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y); + +template +void caffe_gpu_div(const int N, const Dtype* a, const Dtype* b, Dtype* y); + +template +void caffe_gpu_abs(const int n, const Dtype* a, Dtype* y); + +template +void caffe_gpu_exp(const int n, const Dtype* a, Dtype* y); + +template +void caffe_gpu_log(const int n, const Dtype* a, Dtype* y); + +template +void caffe_gpu_powx(const int n, const Dtype* a, const Dtype b, Dtype* y); + +// caffe_gpu_rng_uniform with two arguments generates integers in the range +// [0, UINT_MAX]. +void caffe_gpu_rng_uniform(const int n, unsigned int* r); + +// caffe_gpu_rng_uniform with four arguments generates floats in the range +// (a, b] (strictly greater than a, less than or equal to b) due to the +// specification of curandGenerateUniform. With a = 0, b = 1, just calls +// curandGenerateUniform; with other limits will shift and scale the outputs +// appropriately after calling curandGenerateUniform. 
+template +void caffe_gpu_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r); + +template +void caffe_gpu_rng_gaussian(const int n, const Dtype mu, const Dtype sigma, + Dtype* r); + +template +void caffe_gpu_rng_bernoulli(const int n, const Dtype p, int* r); + +template +void caffe_gpu_dot(const int n, const Dtype* x, const Dtype* y, Dtype* out); + +template +uint32_t caffe_gpu_hamming_distance(const int n, const Dtype* x, + const Dtype* y); + +template +void caffe_gpu_asum(const int n, const Dtype* x, Dtype* y); + +template +void caffe_gpu_sign(const int n, const Dtype* x, Dtype* y); + +template +void caffe_gpu_sgnbit(const int n, const Dtype* x, Dtype* y); + +template +void caffe_gpu_fabs(const int n, const Dtype* x, Dtype* y); + +template +void caffe_gpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y); + +#define DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(name, operation) \ +template \ +__global__ void name##_kernel(const int n, const Dtype* x, Dtype* y) { \ + CUDA_KERNEL_LOOP(index, n) { \ + operation; \ + } \ +} \ +template <> \ +void caffe_gpu_##name(const int n, const float* x, float* y) { \ + /* NOLINT_NEXT_LINE(whitespace/operators) */ \ + name##_kernel<<>>( \ + n, x, y); \ +} \ +template <> \ +void caffe_gpu_##name(const int n, const double* x, double* y) { \ + /* NOLINT_NEXT_LINE(whitespace/operators) */ \ + name##_kernel<<>>( \ + n, x, y); \ +} + +#endif // !CPU_ONLY + +} // namespace caffe + +#endif // CAFFE_UTIL_MATH_FUNCTIONS_H_ diff --git a/include/caffe/util/mkl_alternate.hpp b/include/caffe/util/mkl_alternate.hpp index 3355b665..2ca24374 100644 --- a/include/caffe/util/mkl_alternate.hpp +++ b/include/caffe/util/mkl_alternate.hpp @@ -81,14 +81,12 @@ DEFINE_VSL_BINARY_FUNC(Div, y[i] = a[i] / b[i]); // in standard blas. We will simply use a two-step (inefficient, of course) way // to mimic that. inline void cblas_saxpby(const int N, const float alpha, const float* X, - const int incX, const float beta, float* Y, - const int incY) { + const int incX, const float beta, float* Y, const int incY) { cblas_sscal(N, beta, Y, incY); cblas_saxpy(N, alpha, X, incX, Y, incY); } inline void cblas_daxpby(const int N, const double alpha, const double* X, - const int incX, const double beta, double* Y, - const int incY) { + const int incX, const double beta, double* Y, const int incY) { cblas_dscal(N, beta, Y, incY); cblas_daxpy(N, alpha, X, incX, Y, incY); } diff --git a/include/caffe/util/ocl_util.hpp b/include/caffe/util/ocl_util.hpp new file mode 100644 index 00000000..3027019f --- /dev/null +++ b/include/caffe/util/ocl_util.hpp @@ -0,0 +1,41 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + +#ifndef _CAFFE_UTIL_OCL_UTIL_HPP_ +#define _CAFFE_UTIL_OCL_UTIL_HPP_ + +namespace caffe { +#ifndef CPU_ONLY +template +void ocl_memset(Dtype* buffer, const Dtype value, const int count, const int buf_offset = 0); + +void ocl_memset(cl_mem buffer, const int value, const int count); + +void eventCallback(cl_event event, cl_int event_status, void * user_data); +#endif +} // namespace caffe + +#endif // CAFFE_UTIL_OCL_UTIL_HPP_ diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp new file mode 100644 index 00000000..0ce3a184 --- /dev/null +++ b/include/caffe/util/ocl_wrapper.hpp @@ -0,0 +1,358 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ **************************************************************************************/ + +#ifndef _CAFFE_UTIL_OCL_WRAPPER_HPP_ +#define _CAFFE_UTIL_OCL_WRAPPER_HPP_ + +namespace caffe { + +typedef unsigned int uint32_t; + +template inline std::string get_dtype_suffix() { + dtype x; + const char type = typeid(x).name()[0]; + std::string suffix; + switch (type) { + case 'i': + suffix = "_int"; + break; + case 'd': + suffix = "_double"; + break; + case 'f': + default: + suffix = "_float"; + } + return suffix; +} + +#ifndef CPU_ONLY +template +void transform_gpu(Dtype* src, Dtype* dst, const int top_offset, const int N_, + const int M_, const int packing_num); + +template +void opttrans(const Dtype* data_im, const int im_offset, const int channels, + const int height, const int width, Dtype* data_opt, const int opt_offset, + const int optnum); + +template +void get_max_gpu(cl_kernel Kernel, const int num, const int dim, + const Dtype* bottom_data, Dtype* scale_data); + +template +void exp_gpu(cl_kernel Kernel, const int num, const Dtype* data, Dtype* out); + +template +void softmax_div_gpu(cl_kernel Kernel, const int num, const int dim, + const Dtype* scale, Dtype* data); + +template +Dtype softmax_gpu(cl_kernel Kernel, const int num, const int dim, + const Dtype* prob_data, const Dtype* label, cl_mem d_loss); + +template +void scal_gpu(cl_kernel Kernel, const int num, const Dtype alpha, Dtype* data); + +template +void diff_gpu(cl_kernel Kernel, const int num, const int dim, Dtype* data, + const Dtype* label); + +template +void max_pool_fp_gpu(cl_kernel Kernel, const int count, + const Dtype* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + Dtype* top_data); + +template +void MaxPoolForward(const int count, const Dtype* bottom_data, const int clnum, + const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + const int pad_h_, const int pad_w_, Dtype* top_data, int* mask, + Dtype* top_mask); + +template +void MaxPoolBackward(const int nthreads, const Dtype* const top_diff, + const int* const mask, const Dtype* const top_mask, const int num, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, const int pad_h, + const int pad_w, Dtype* const bottom_diff); + +template +void AvePoolBackward(const int nthreads, const Dtype* const top_diff, + const int num, const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, const int pad_h, + const int pad_w, Dtype* const bottom_diff); + +template +void StoPoolBackward(const int nthreads, const Dtype* const rand_idx, + const Dtype* const top_diff, const int num, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, Dtype* const bottom_diff); +template +void SigmoidForward(const int count, const Dtype* bottom_data, Dtype* top_data); + +template +void SigmoidBackward(const int count, const Dtype* top_diff, + const Dtype* top_data, Dtype* bottom_diff); + +template +void TanHForward(const 
+
+template <typename Dtype>
+void TanHBackward(const int count, const Dtype* top_diff, const Dtype* top_data,
+    Dtype* bottom_diff);
+
+template <typename Dtype>
+void ThresholdForward(const int count, const Dtype threshold,
+    const Dtype* bottom_data, Dtype* top_data);
+
+template <typename Dtype>
+void ave_pool_fp_gpu(cl_kernel Kernel, const int count,
+    const Dtype* bottom_data, const int clnum, const int channels_,
+    const int height_, const int width_, const int pooled_height_,
+    const int pooled_width_, const int kernel_size_, const int stride_,
+    const int pad_, Dtype* top_data);
+
+template <typename Dtype>
+void AvePoolForward(const int count, const Dtype* bottom_data, const int clnum,
+    const int channels_, const int height_, const int width_,
+    const int pooled_height_, const int pooled_width_, const int kernel_h_,
+    const int kernel_w_, const int stride_h_, const int stride_w_,
+    const int pad_h_, const int pad_w_, Dtype* top_data);
+
+template <typename Dtype>
+void StoPoolForwardTrain(const int count, const Dtype* bottom_data,
+    const int clnum, const int channels_, const int height_, const int width_,
+    const int pooled_height_, const int pooled_width_, const int kernel_h_,
+    const int kernel_w_, const int stride_h_, const int stride_w_,
+    Dtype* idx_data, Dtype* top_data);
+
+template <typename Dtype>
+void StoPoolForwardTest(const int count, const Dtype* bottom_data,
+    const int clnum, const int channels_, const int height_, const int width_,
+    const int pooled_height_, const int pooled_width_, const int kernel_h_,
+    const int kernel_w_, const int stride_h_, const int stride_w_,
+    Dtype* top_data);
+
+template <typename Dtype>
+void max_pool_bp_gpu(cl_kernel Kernel, const int count,
+    const Dtype* bottom_data, const Dtype* top_data, const Dtype* top_diff,
+    const int clnum, const int channels_, const int height_, const int width_,
+    const int pooled_height_, const int pooled_width_, const int kernel_size_,
+    const int stride_, Dtype* bottom_diff);
+
+template <typename Dtype>
+void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff,
+    const int clnum, const int channels_, const int intheight_,
+    const int width_, const int pooled_height_, const int pooled_width_,
+    const int kernel_size_, const int stride_, const int pad_,
+    Dtype* bottom_diff);
+
+template <typename Dtype>
+void PReLUForward(const int count, const int channels, const int dim,
+    const Dtype* bottom_data, Dtype* top_data, const Dtype* slope_data,
+    const int div_factor);
+
+template <typename Dtype>
+void PReLUBackward(const int count, const int channels, const int dim,
+    const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff,
+    const Dtype* slope_data, const int div_factor);
+
+template <typename Dtype>
+void PReLUParamBackward(const int count, const Dtype* top_diff,
+    const int offset_out, const Dtype* bottom_data, const int offset_in,
+    Dtype* bottom_diff);
+
+template <typename Dtype>
+void ReLUForward(const int count, const Dtype* bottom_data, Dtype* top_data,
+    Dtype negative_slope);
+
+template <typename Dtype>
+void ReLUBackward(const int count, const Dtype* top_diff,
+    const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope);
+
+template <typename Dtype>
+void caffe_gpu_div(const int n, const Dtype* a, const Dtype* b, Dtype* y);
+
+template <typename Dtype>
+void DropoutForward(const int count, const Dtype* bottom_data,
+    const unsigned int* MaskMem, const unsigned int threshold,
+    const float scale_, Dtype *top_data);
+
+template <typename Dtype>
+void DropoutBackward(const int count, const Dtype* top_diff,
+    const unsigned int* MaskMem, const unsigned int threshold_,
+    const float scale_, Dtype* bottom_diff);
+
+template <typename Dtype>
+void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup,
+    Dtype threshold);
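A hedged sketch (not part of the patch) of how DropoutForward above would typically be driven: the mask is first filled with random unsigned integers (for example with caffe_gpu_uniform, declared just below), then DropoutForward keeps an activation only where its mask value exceeds the threshold and rescales by 1/(1 - ratio). Buffer and variable names here are assumptions:

    // UINT_MAX comes from <climits>.
    template <typename Dtype>
    void dropout_forward_sketch(const int count, const Dtype* bottom,
                                unsigned int* mask, float dropout_ratio,
                                Dtype* top) {
      const unsigned int uint_thres =
          static_cast<unsigned int>(UINT_MAX * dropout_ratio);
      const float scale = 1.f / (1.f - dropout_ratio);
      caffe_gpu_uniform(count, mask);  // fill mask with random uints on the device
      DropoutForward(count, bottom, mask, uint_thres, scale, top);
    }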
+
+void caffe_gpu_uniform(const unsigned int n, unsigned int *r,
+    unsigned int _seed = 0);
+
+template <typename Dtype>
+void caffe_gpu_uniform(Dtype* a, const unsigned int n, Dtype inf, Dtype sup,
+    unsigned int _seed = 0);
+
+template <typename Dtype>
+void caffe_gpu_gaussian(Dtype* a, const unsigned int n, Dtype E, Dtype V);
+
+template <typename Dtype>
+void caffe_gpu_abs_ocl(const int N, const Dtype* X, Dtype * Y);
+
+template <typename Dtype>
+void caffe_gpu_signbit(const int N, const Dtype* X, Dtype * Y);
+
+template <typename Dtype>
+void caffe_gpu_sign_ocl(const int N, const Dtype* X, Dtype * Y);
+
+template <typename Dtype>
+void caffe_gpu_sign_with_offset_ocl(const int N, const Dtype* X, const int offx,
+    Dtype * Y, const int offy);
+
+template <typename Dtype>
+void kernel_channel_max(const int num, const int channels,
+    const int spatial_dim, const Dtype* data, Dtype* out);
+
+template <typename Dtype>
+void kernel_channel_subtract(const int count, const int num, const int channels,
+    const int spatial_dim, const Dtype* channel_max, Dtype* data);
+
+template <typename Dtype>
+void kernel_powx(const int count, const Dtype* data, const Dtype alpha,
+    Dtype* out);
+
+template <typename Dtype>
+void kernel_div(const int count, const Dtype* a, const Dtype* b, Dtype* out);
+
+template <typename Dtype>
+void kernel_add(const int count, const Dtype* a, const Dtype* b, Dtype* out);
+
+template <typename Dtype>
+void kernel_mul(const int count, const Dtype* a, const Dtype* b, Dtype* out);
+
+template <typename Dtype>
+void kernel_log(const int count, const Dtype* data, Dtype* out);
+
+template <typename Dtype>
+void kernel_sub(const int count, const Dtype* a, const Dtype* b, Dtype* out);
+
+template <typename Dtype>
+void kernel_add_scalar(const int count, const Dtype data, Dtype* out);
+
+template <typename Dtype>
+void kernel_exp(const int count, const Dtype* data, Dtype* out);
+
+template <typename Dtype>
+void kernel_channel_sum(const int num, const int channels,
+    const int spatial_dim, const Dtype* data, Dtype* channel_sum);
+
+template <typename Dtype>
+void kernel_channel_div(const int count, const int num, const int channels,
+    const int spatial_dim, const Dtype* channel_sum, Dtype* data);
+
+template <typename Dtype>
+void kernel_channel_dot(const int num, const int channels,
+    const int spatial_dim, const Dtype* data_1, const Dtype* data_2,
+    Dtype* channel_dot);
+
+template <typename Dtype>
+void SoftmaxLossForwardGPU(const int nthreads, const Dtype* prob_data,
+    const Dtype* label, Dtype* loss, const int num, const int dim,
+    const int spatial_dim, const bool has_ignore_label_,
+    const int ignore_label_, Dtype* counts);
+
+template <typename Dtype>
+void SoftmaxLossBackwardGPU(const int nthreads, const Dtype* top,
+    const Dtype* label, Dtype* bottom_diff, const int num, const int dim,
+    const int spatial_dim, const bool has_ignore_label_,
+    const int ignore_label_, Dtype* counts);
+
+template <typename Dtype>
+void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y);
+
+template <typename Dtype>
+void caffe_gpu_add_scalar(const int n, const Dtype alpha, Dtype* top_data);
+
+template <typename Dtype>
+void LRNFillScale(const int nthreads, const Dtype* const in, const int num,
+    const int channels, const int height, const int width, const int size,
+    const Dtype alpha_over_size, const Dtype k, Dtype* const scale);
+
+template <typename Dtype>
+void LRNComputeOutput(int nthreads, const Dtype* in, Dtype* scale,
+    Dtype negative_beta, Dtype* out);
+
+template <typename Dtype>
+void LRNComputeDiff(const int nthreads, const Dtype* const bottom_data,
+    const Dtype* const top_data, const Dtype* const scale,
+    const Dtype* const top_diff, const int num, const int channels,
+    const int height, const int width, const int size,
+    const Dtype negative_beta, const Dtype cache_ratio,
+    Dtype* const bottom_diff);
+
+template <typename Dtype>
+void caffe_gpu_powx(const int n, const Dtype* a, const Dtype alpha, Dtype* y);
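The channel-wise helpers above (kernel_channel_max, kernel_channel_subtract, kernel_exp, kernel_channel_sum, kernel_channel_div) mirror the CUDA kernels that Caffe's SoftmaxLayer chains together. A sketch (not part of the patch; function and buffer names besides the declared kernels are assumptions) of the forward chain they support, given a buffer `top` that already holds a copy of the input and a `scale` buffer of num * spatial_dim elements:

    template <typename Dtype>
    void softmax_forward_sketch(const int num, const int channels,
                                const int spatial_dim, Dtype* top, Dtype* scale) {
      const int count = num * channels * spatial_dim;
      kernel_channel_max(num, channels, spatial_dim, top, scale);         // per-position max
      kernel_channel_subtract(count, num, channels, spatial_dim, scale, top);
      kernel_exp(count, top, top);                                        // exponentiate in place
      kernel_channel_sum(num, channels, spatial_dim, top, scale);         // per-position sum
      kernel_channel_div(count, num, channels, spatial_dim, scale, top);  // normalize
    }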
+
+template <typename Dtype>
+void caffe_gpu_mul(const int n, const Dtype* a, const Dtype* b, Dtype* y);
+
+template <typename Dtype>
+void BNLLForward(const int count, const Dtype* bottom_data, Dtype *top_data);
+
+template <typename Dtype>
+void BNLLBackward(const int count, const Dtype* top_diff,
+    const Dtype* bottom_data, Dtype *bottom_diff);
+
+template <typename Dtype>
+void Concat(const int nthreads, const Dtype* in_data, const bool forward,
+    const int num_concats, const int concat_size, const int top_concat_axis,
+    const int bottom_concat_axis, const int offset_concat_axis,
+    Dtype *out_data);
+
+template <typename Dtype>
+void CLLBackward(const int count, const int channels, const Dtype margin,
+    const bool legacy_version, const Dtype alpha, const Dtype* y,
+    const Dtype* diff, const Dtype* dist_sq, Dtype *bottom_diff);
+
+template <typename Dtype>
+void MaxForward(const int nthreads, const Dtype* bottom_data_a,
+    const Dtype* bottom_data_b, const int blob_idx, Dtype* top_data, int* mask);
+
+template <typename Dtype>
+void MaxBackward(const int nthreads, const Dtype* top_diff, const int blob_idx,
+    const int* mask, Dtype* bottom_diff);
+
+template <typename Dtype>
+void Slice(const int nthreads, const Dtype* in_data,
+    const bool forward, const int num_slices, const int slice_size,
+    const int bottom_slice_axis, const int top_slice_axis,
+    const int offset_slice_axis, Dtype* out_data);
+#endif
+}  // namespace caffe
+#endif  // _CAFFE_UTIL_OCL_WRAPPER_HPP_
diff --git a/include/caffe/util/rng.hpp b/include/caffe/util/rng.hpp
index 8f1cf0d1..febd932d 100644
--- a/include/caffe/util/rng.hpp
+++ b/include/caffe/util/rng.hpp
@@ -20,13 +20,13 @@ inline rng_t* caffe_rng() {
 // Fisher–Yates algorithm
 template <class RandomAccessIterator, class RandomGenerator>
 inline void shuffle(RandomAccessIterator begin, RandomAccessIterator end,
-                    RandomGenerator* gen) {
-  typedef typename std::iterator_traits<RandomAccessIterator>::difference_type
-      difference_type;
+    RandomGenerator* gen) {
+  typedef typename std::iterator_traits<RandomAccessIterator>::difference_type difference_type;
   typedef typename boost::uniform_int<difference_type> dist_type;
 
   difference_type length = std::distance(begin, end);
-  if (length <= 0) return;
+  if (length <= 0)
+    return;
 
   for (difference_type i = length - 1; i > 0; --i) {
     dist_type dist(0, i);
diff --git a/include/caffe/util/upgrade_proto.hpp b/include/caffe/util/upgrade_proto.hpp
index c1f21a0d..496ba1e0 100644
--- a/include/caffe/util/upgrade_proto.hpp
+++ b/include/caffe/util/upgrade_proto.hpp
@@ -23,11 +23,11 @@ bool UpgradeV0Net(const NetParameter& v0_net_param, NetParameter* net_param);
 // taking its top blob as input.
 // Error if any of these above layers are not-conv layers.
 void UpgradeV0PaddingLayers(const NetParameter& param,
-                            NetParameter* param_upgraded_pad);
+    NetParameter* param_upgraded_pad);
 
 // Upgrade a single V0LayerConnection to the V1LayerParameter format.
 bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
-                             V1LayerParameter* layer_param);
+    V1LayerParameter* layer_param);
 
 V1LayerParameter_LayerType UpgradeV0LayerType(const string& type);
 
@@ -46,7 +46,7 @@ bool NetNeedsV1ToV2Upgrade(const NetParameter& net_param);
 bool UpgradeV1Net(const NetParameter& v1_net_param, NetParameter* net_param);
 
 bool UpgradeV1LayerParameter(const V1LayerParameter& v1_layer_param,
-                             LayerParameter* layer_param);
+    LayerParameter* layer_param);
 
 const char* UpgradeV1LayerType(const V1LayerParameter_LayerType type);
 
@@ -55,9 +55,9 @@ bool UpgradeNetAsNeeded(const string& param_file, NetParameter* param);
 
 // Read parameters from a file into a NetParameter proto message.
void ReadNetParamsFromTextFileOrDie(const string& param_file, - NetParameter* param); + NetParameter* param); void ReadNetParamsFromBinaryFileOrDie(const string& param_file, - NetParameter* param); + NetParameter* param); } // namespace caffe diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp index a6bd86a9..381b983b 100644 --- a/include/caffe/vision_layers.hpp +++ b/include/caffe/vision_layers.hpp @@ -21,94 +21,149 @@ namespace caffe { * ConvolutionLayer and DeconvolutionLayer. */ template -class BaseConvolutionLayer : public Layer { - public: - explicit BaseConvolutionLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline int MinBottomBlobs() const { return 1; } - virtual inline int MinTopBlobs() const { return 1; } - virtual inline bool EqualNumBottomTopBlobs() const { return true; } - - protected: - // Helper functions that abstract away the column buffer and gemm arguments. - // The last argument in forward_cpu_gemm is so that we can skip the im2col if - // we just called weight_cpu_gemm with the same input. - void forward_cpu_gemm(const Dtype* input, const Dtype* weights, - Dtype* output, bool skip_im2col = false); - void forward_cpu_bias(Dtype* output, const Dtype* bias); - void backward_cpu_gemm(const Dtype* input, const Dtype* weights, - Dtype* output); - void weight_cpu_gemm(const Dtype* input, const Dtype* output, Dtype* - weights); - void backward_cpu_bias(Dtype* bias, const Dtype* input); +class BaseConvolutionLayer: public Layer { + public: + explicit BaseConvolutionLayer(const LayerParameter& param) + : Layer(param) { + } + virtual ~BaseConvolutionLayer(); + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline int MinBottomBlobs() const { + return 1; + } + virtual inline int MinTopBlobs() const { + return 1; + } + virtual inline bool EqualNumBottomTopBlobs() const { + return true; + } + + protected: + // Helper functions that abstract away the column buffer and gemm arguments. + // The last argument in forward_cpu_gemm is so that we can skip the im2col if + // we just called weight_cpu_gemm with the same input. 
+ void forward_cpu_gemm(const Dtype* input, const Dtype* weights, + Dtype* output, bool skip_im2col = false); + void forward_cpu_bias(Dtype* output, const Dtype* bias); + void backward_cpu_gemm(const Dtype* input, const Dtype* weights, + Dtype* output); + void weight_cpu_gemm(const Dtype* input, const Dtype* output, + Dtype* weights); + void backward_cpu_bias(Dtype* bias, const Dtype* input); + //opencl related setup + void ocl_setup(); #ifndef CPU_ONLY - void forward_gpu_gemm(const Dtype* col_input, const Dtype* weights, - Dtype* output, bool skip_im2col = false); - void forward_gpu_bias(Dtype* output, const Dtype* bias); - void backward_gpu_gemm(const Dtype* input, const Dtype* weights, - Dtype* col_output); - void weight_gpu_gemm(const Dtype* col_input, const Dtype* output, Dtype* - weights); - void backward_gpu_bias(Dtype* bias, const Dtype* input); + void forward_gpu_gemm(const Dtype* col_input, const Dtype* weights, + Dtype* output, bool skip_im2col = false); + void forward_gpu_gemm_opt(const Dtype* col_input, const Dtype* weights, + Dtype* output, bool skip_im2col = false); + void forward_gpu_bias(Dtype* output, const Dtype* bias); + void forward_gpu_bias_opt(Dtype* output, const Dtype* bias); + void backward_gpu_gemm(const Dtype* input, const Dtype* weights, + Dtype* col_output); + void backward_gpu_gemm_opt(const Dtype* input, const Dtype* weights, + Dtype* col_output); + void weight_gpu_gemm(const Dtype* col_input, const Dtype* output, + Dtype* weights); + void weight_gpu_gemm_opt(const Dtype* col_input, const Dtype* output, + Dtype* weights); + void backward_gpu_bias(Dtype* bias, const Dtype* input); #endif - // reverse_dimensions should return true iff we are implementing deconv, so - // that conv helpers know which dimensions are which. - virtual bool reverse_dimensions() = 0; - // Compute height_out_ and width_out_ from other parameters. - virtual void compute_output_shape() = 0; - - int kernel_h_, kernel_w_; - int stride_h_, stride_w_; - int num_; - int channels_; - int pad_h_, pad_w_; - int height_, width_; - int group_; - int num_output_; - int height_out_, width_out_; - bool bias_term_; - bool is_1x1_; - - private: - // wrap im2col/col2im so we don't have to remember the (long) argument lists - inline void conv_im2col_cpu(const Dtype* data, Dtype* col_buff) { - im2col_cpu(data, conv_in_channels_, conv_in_height_, conv_in_width_, - kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff); - } - inline void conv_col2im_cpu(const Dtype* col_buff, Dtype* data) { - col2im_cpu(col_buff, conv_in_channels_, conv_in_height_, conv_in_width_, - kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data); - } + // reverse_dimensions should return true iff we are implementing deconv, so + // that conv helpers know which dimensions are which. + virtual bool reverse_dimensions() = 0; + // Compute height_out_ and width_out_ from other parameters. 
+ virtual void compute_output_shape() = 0; + + int kernel_h_, kernel_w_; + int stride_h_, stride_w_; + int num_; + int channels_; + int pad_h_, pad_w_; + int height_, width_; + int group_; + int num_output_; + int height_out_, width_out_; + bool bias_term_; + bool is_1x1_; + + private: + // wrap im2col/col2im so we don't have to remember the (long) argument lists + inline void conv_im2col_cpu(const Dtype* data, Dtype* col_buff) { + im2col_cpu(data, conv_in_channels_, conv_in_height_, conv_in_width_, + kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff); + } + inline void conv_col2im_cpu(const Dtype* col_buff, Dtype* data) { + col2im_cpu(col_buff, conv_in_channels_, conv_in_height_, conv_in_width_, + kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data); + } #ifndef CPU_ONLY - inline void conv_im2col_gpu(const Dtype* data, Dtype* col_buff) { - im2col_gpu(data, conv_in_channels_, conv_in_height_, conv_in_width_, - kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff); - } - inline void conv_col2im_gpu(const Dtype* col_buff, Dtype* data) { - col2im_gpu(col_buff, conv_in_channels_, conv_in_height_, conv_in_width_, - kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data); - } + inline void conv_im2col_gpu(const Dtype* data, Dtype* col_buff) { + im2col_gpu(data, bottom_offset_, conv_in_channels_, conv_in_height_, + conv_in_width_, kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, + stride_w_, col_buff, 0); + } + inline void conv_col2im_gpu(const Dtype* col_buff, Dtype* data) { + col2im_gpu(col_buff, 0, conv_in_channels_, conv_in_height_, + conv_in_width_, kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, + stride_w_, data, bottom_offset_); + } + protected: + inline void conv_im2col_gpu_opt(const Dtype* data) { + im2col_gpu_opt(data, bottom_offset_, conv_in_channels_, conv_in_height_, + conv_in_width_, kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, + stride_w_, (Dtype*) transMem, 0, + opt_num2); + } + inline void conv_col2im_gpu_opt(Dtype* data) { + col2im_gpu_opt((Dtype*) transMem, 0, conv_in_channels_, conv_in_height_, + conv_in_width_, kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, + stride_w_, data, bottom_offset_, + opt_num2); + } + private: + inline void conv_transform_gpu(const Dtype* temp_buffer, Dtype* top_data) { + transform_gpu((Dtype*) temp_buffer, top_data, top_offset_, N_, + M_ * opt_num2, opt_num2); + } + inline void conv_transpose_gpu(const Dtype* data) { + opttrans(data, top_offset_, 1, M_ * group_, N_, (Dtype*) subTopMem, 0, + opt_num2); + } + protected: + inline void gpu_memset(Dtype* data, Dtype value, int count) { + ocl_memset(data, value, count); + } #endif - int conv_out_channels_; - int conv_in_channels_; - int conv_out_spatial_dim_; - int conv_in_height_; - int conv_in_width_; - int kernel_dim_; - int weight_offset_; - int col_offset_; - int output_offset_; - - Blob col_buffer_; - Blob bias_multiplier_; + private: + int conv_out_channels_; + int conv_in_channels_; + int conv_out_spatial_dim_; + int conv_in_height_; + int conv_in_width_; + int kernel_dim_; + + Blob col_buffer_; + Blob bias_multiplier_; + +//opencl related data structures + protected: + int opt_num2; + int M_, N_, K_; + int weight_offset_; + int col_offset_; + int output_offset_; + int top_offset_, top_offset_opt, bottom_offset_; + public: + static cl_mem subTopMem, transMem; + static size_t subtop_mem_size, trans_mem_size; }; /** @@ -128,52 +183,67 @@ class BaseConvolutionLayer : public Layer { * the output channel N' columns of the output matrix. 
*/ template -class ConvolutionLayer : public BaseConvolutionLayer { - public: - /** - * @param param provides ConvolutionParameter convolution_param, - * with ConvolutionLayer options: - * - num_output. The number of filters. - * - kernel_size / kernel_h / kernel_w. The filter dimensions, given by - * kernel_size for square filters or kernel_h and kernel_w for rectangular - * filters. - * - stride / stride_h / stride_w (\b optional, default 1). The filter - * stride, given by stride_size for equal dimensions or stride_h and stride_w - * for different strides. By default the convolution is dense with stride 1. - * - pad / pad_h / pad_w (\b optional, default 0). The zero-padding for - * convolution, given by pad for equal dimensions or pad_h and pad_w for - * different padding. Input padding is computed implicitly instead of - * actually padding. - * - group (\b optional, default 1). The number of filter groups. Group - * convolution is a method for reducing parameterization by selectively - * connecting input and output channels. The input and output channel dimensions must be divisible - * by the number of groups. For group @f$ \geq 1 @f$, the - * convolutional filters' input and output channels are separated s.t. each - * group takes 1 / group of the input channels and makes 1 / group of the - * output channels. Concretely 4 input channels, 8 output channels, and - * 2 groups separate input channels 1-2 and output channels 1-4 into the - * first group and input channels 3-4 and output channels 5-8 into the second - * group. - * - bias_term (\b optional, default true). Whether to have a bias. - * - engine: convolution has CAFFE (matrix multiplication) and CUDNN (library - * kernels + stream parallelism) engines. - */ - explicit ConvolutionLayer(const LayerParameter& param) - : BaseConvolutionLayer(param) {} - - virtual inline const char* type() const { return "Convolution"; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual inline bool reverse_dimensions() { return false; } - virtual void compute_output_shape(); +class ConvolutionLayer: public BaseConvolutionLayer { + public: + /** + * @param param provides ConvolutionParameter convolution_param, + * with ConvolutionLayer options: + * - num_output. The number of filters. + * - kernel_size / kernel_h / kernel_w. The filter dimensions, given by + * kernel_size for square filters or kernel_h and kernel_w for rectangular + * filters. + * - stride / stride_h / stride_w (\b optional, default 1). The filter + * stride, given by stride_size for equal dimensions or stride_h and stride_w + * for different strides. By default the convolution is dense with stride 1. + * - pad / pad_h / pad_w (\b optional, default 0). The zero-padding for + * convolution, given by pad for equal dimensions or pad_h and pad_w for + * different padding. Input padding is computed implicitly instead of + * actually padding. + * - group (\b optional, default 1). The number of filter groups. Group + * convolution is a method for reducing parameterization by selectively + * connecting input and output channels. The input and output channel dimensions must be divisible + * by the number of groups. 
For group @f$ \geq 1 @f$, the + * convolutional filters' input and output channels are separated s.t. each + * group takes 1 / group of the input channels and makes 1 / group of the + * output channels. Concretely 4 input channels, 8 output channels, and + * 2 groups separate input channels 1-2 and output channels 1-4 into the + * first group and input channels 3-4 and output channels 5-8 into the second + * group. + * - bias_term (\b optional, default true). Whether to have a bias. + * - engine: convolution has CAFFE (matrix multiplication) and CUDNN (library + * kernels + stream parallelism) engines. + */ + explicit ConvolutionLayer(const LayerParameter& param) + : BaseConvolutionLayer(param) { + } + + virtual inline const char* type() const { + return "Convolution"; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual inline bool reverse_dimensions() { + return false; + } + virtual void compute_output_shape(); +#ifndef CPU_ONLY + virtual void Forward_gpu_org(const vector*>& bottom, + const vector*>& top); + virtual void Backward_gpu_org(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Forward_gpu_batched(const vector*>& bottom, + const vector*>& top); + virtual void Backward_gpu_batched(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); +#endif }; /** @@ -191,24 +261,29 @@ class ConvolutionLayer : public BaseConvolutionLayer { * stride results in upsampling rather than downsampling). */ template -class DeconvolutionLayer : public BaseConvolutionLayer { - public: - explicit DeconvolutionLayer(const LayerParameter& param) - : BaseConvolutionLayer(param) {} - - virtual inline const char* type() const { return "Deconvolution"; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual inline bool reverse_dimensions() { return true; } - virtual void compute_output_shape(); +class DeconvolutionLayer: public BaseConvolutionLayer { + public: + explicit DeconvolutionLayer(const LayerParameter& param) + : BaseConvolutionLayer(param) { + } + + virtual inline const char* type() const { + return "Deconvolution"; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual inline bool reverse_dimensions() { + return true; + } + virtual void compute_output_shape(); }; #ifdef USE_CUDNN @@ -225,19 +300,19 @@ class DeconvolutionLayer : public BaseConvolutionLayer { * input and filter regimes the CUDNN engine is faster than the CAFFE engine, * but for fully-convolutional models and large inputs the CAFFE engine can be * faster as long as it fits in memory. 
-*/ + */ template class CuDNNConvolutionLayer : public ConvolutionLayer { - public: + public: explicit CuDNNConvolutionLayer(const LayerParameter& param) - : ConvolutionLayer(param), handles_setup_(false) {} + : ConvolutionLayer(param), handles_setup_(false) {} virtual void LayerSetUp(const vector*>& bottom, const vector*>& top); virtual void Reshape(const vector*>& bottom, const vector*>& top); virtual ~CuDNNConvolutionLayer(); - protected: + protected: virtual void Forward_gpu(const vector*>& bottom, const vector*>& top); virtual void Backward_gpu(const vector*>& top, @@ -245,10 +320,10 @@ class CuDNNConvolutionLayer : public ConvolutionLayer { bool handles_setup_; cudnnHandle_t* handle_; - cudaStream_t* stream_; + cudaStream_t* stream_; vector bottom_descs_, top_descs_; - cudnnTensorDescriptor_t bias_desc_; - cudnnFilterDescriptor_t filter_desc_; + cudnnTensorDescriptor_t bias_desc_; + cudnnFilterDescriptor_t filter_desc_; vector conv_descs_; int bottom_offset_, top_offset_, weight_offset_, bias_offset_; size_t workspaceSizeInBytes; @@ -264,34 +339,41 @@ class CuDNNConvolutionLayer : public ConvolutionLayer { * TODO(dox): thorough documentation for Forward, Backward, and proto params. */ template -class Im2colLayer : public Layer { - public: - explicit Im2colLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Im2col"; } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int ExactNumTopBlobs() const { return 1; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - int kernel_h_, kernel_w_; - int stride_h_, stride_w_; - int channels_; - int height_, width_; - int pad_h_, pad_w_; +class Im2colLayer: public Layer { + public: + explicit Im2colLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Im2col"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + int kernel_h_, kernel_w_; + int stride_h_, stride_w_; + int channels_; + int height_, width_; + int pad_h_, pad_w_; }; // Forward declare PoolingLayer and SplitLayer for use in LRNLayer. @@ -304,152 +386,168 @@ template class SplitLayer; * TODO(dox): thorough documentation for Forward, Backward, and proto params. 
*/ template -class LRNLayer : public Layer { - public: - explicit LRNLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "LRN"; } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int ExactNumTopBlobs() const { return 1; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); +class LRNLayer: public Layer { + public: + explicit LRNLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "LRN"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + virtual void CrossChannelForward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void CrossChannelForward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void WithinChannelForward(const vector*>& bottom, + const vector*>& top); + virtual void CrossChannelBackward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void CrossChannelBackward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void WithinChannelBackward(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + int size_; + int pre_pad_; + Dtype alpha_; + Dtype beta_; + Dtype k_; + int num_; + int channels_; + int height_; + int width_; + + // Fields used for normalization ACROSS_CHANNELS + // scale_ stores the intermediate summing results + Blob scale_; + + // Fields used for normalization WITHIN_CHANNEL + shared_ptr > split_layer_; + vector*> split_top_vec_; + shared_ptr > square_layer_; + Blob square_input_; + Blob square_output_; + vector*> square_bottom_vec_; + vector*> square_top_vec_; + shared_ptr > pool_layer_; + Blob pool_output_; + vector*> pool_top_vec_; + shared_ptr > power_layer_; + Blob power_output_; + vector*> power_top_vec_; + shared_ptr > product_layer_; + Blob product_input_; + vector*> product_bottom_vec_; - virtual void CrossChannelForward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void CrossChannelForward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void WithinChannelForward(const vector*>& bottom, - const vector*>& top); - virtual void CrossChannelBackward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void CrossChannelBackward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void WithinChannelBackward(const vector*>& 
top, - const vector& propagate_down, const vector*>& bottom); - - int size_; - int pre_pad_; - Dtype alpha_; - Dtype beta_; - Dtype k_; - int num_; - int channels_; - int height_; - int width_; - - // Fields used for normalization ACROSS_CHANNELS - // scale_ stores the intermediate summing results - Blob scale_; - - // Fields used for normalization WITHIN_CHANNEL - shared_ptr > split_layer_; - vector*> split_top_vec_; - shared_ptr > square_layer_; - Blob square_input_; - Blob square_output_; - vector*> square_bottom_vec_; - vector*> square_top_vec_; - shared_ptr > pool_layer_; - Blob pool_output_; - vector*> pool_top_vec_; - shared_ptr > power_layer_; - Blob power_output_; - vector*> power_top_vec_; - shared_ptr > product_layer_; - Blob product_input_; - vector*> product_bottom_vec_; }; - -/** +/*n * @brief Pools the input image by taking the max, average, etc. within regions. * * TODO(dox): thorough documentation for Forward, Backward, and proto params. */ template -class PoolingLayer : public Layer { - public: - explicit PoolingLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Pooling"; } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int MinTopBlobs() const { return 1; } - // MAX POOL layers can output an extra top blob for the mask; - // others can only output the pooled inputs. - virtual inline int MaxTopBlobs() const { - return (this->layer_param_.pooling_param().pool() == - PoolingParameter_PoolMethod_MAX) ? 2 : 1; - } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); +class PoolingLayer: public Layer { + public: + explicit PoolingLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Pooling"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int MinTopBlobs() const { + return 1; + } + // MAX POOL layers can output an extra top blob for the mask; + // others can only output the pooled inputs. + virtual inline int MaxTopBlobs() const { + return + (this->layer_param_.pooling_param().pool() + == PoolingParameter_PoolMethod_MAX) ? 
2 : 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + int kernel_h_, kernel_w_; + int stride_h_, stride_w_; + int pad_h_, pad_w_; + int channels_; + int height_, width_; + int pooled_height_, pooled_width_; + bool global_pooling_; + Blob rand_idx_; + Blob max_idx_; - int kernel_h_, kernel_w_; - int stride_h_, stride_w_; - int pad_h_, pad_w_; - int channels_; - int height_, width_; - int pooled_height_, pooled_width_; - bool global_pooling_; - Blob rand_idx_; - Blob max_idx_; }; #ifdef USE_CUDNN /* * @brief cuDNN implementation of PoolingLayer. * Fallback to PoolingLayer for CPU mode. -*/ + */ template class CuDNNPoolingLayer : public PoolingLayer { - public: + public: explicit CuDNNPoolingLayer(const LayerParameter& param) - : PoolingLayer(param), handles_setup_(false) {} + : PoolingLayer(param), handles_setup_(false) {} virtual void LayerSetUp(const vector*>& bottom, const vector*>& top); virtual void Reshape(const vector*>& bottom, const vector*>& top); virtual ~CuDNNPoolingLayer(); // Currently, cuDNN does not support the extra top blob. - virtual inline int MinTopBlobs() const { return -1; } - virtual inline int ExactNumTopBlobs() const { return 1; } + virtual inline int MinTopBlobs() const {return -1;} + virtual inline int ExactNumTopBlobs() const {return 1;} - protected: + protected: virtual void Forward_gpu(const vector*>& bottom, const vector*>& top); virtual void Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); bool handles_setup_; - cudnnHandle_t handle_; + cudnnHandle_t handle_; cudnnTensorDescriptor_t bottom_desc_, top_desc_; - cudnnPoolingDescriptor_t pooling_desc_; - cudnnPoolingMode_t mode_; + cudnnPoolingDescriptor_t pooling_desc_; + cudnnPoolingMode_t mode_; }; #endif @@ -460,63 +558,71 @@ class CuDNNPoolingLayer : public PoolingLayer { * images are of the same size. */ template -class SPPLayer : public Layer { - public: - explicit SPPLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "SPP"; } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int MinTopBlobs() const { return 1; } - // MAX POOL layers can output an extra top blob for the mask; - // others can only output the pooled inputs. - virtual inline int MaxTopBlobs() const { - return (this->layer_param_.pooling_param().pool() == - PoolingParameter_PoolMethod_MAX) ? 
2 : 1; - } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - // calculates the kernel and stride dimensions for the pooling layer, - // returns a correctly configured LayerParameter for a PoolingLayer - virtual LayerParameter GetPoolingParam(const int pyramid_level, - const int bottom_h, const int bottom_w, const SPPParameter spp_param); - - int pyramid_height_; - int bottom_h_, bottom_w_; - int channels_; - int kernel_h_, kernel_w_; - int pad_h_, pad_w_; - - /// the internal Split layer that feeds the pooling layers - shared_ptr > split_layer_; - /// top vector holder used in call to the underlying SplitLayer::Forward - vector*> split_top_vec_; - /// bottom vector holder used in call to the underlying PoolingLayer::Forward - vector*>*> pooling_bottom_vecs_; - /// the internal Pooling layers of different kernel sizes - vector > > pooling_layers_; - /// top vector holders used in call to the underlying PoolingLayer::Forward - vector*>*> pooling_top_vecs_; - /// pooling_outputs stores the outputs of the PoolingLayers - vector*> pooling_outputs_; - /// the internal Flatten layers that the Pooling layers feed into - vector*> flatten_layers_; - /// top vector holders used in call to the underlying FlattenLayer::Forward - vector*>*> flatten_top_vecs_; - /// flatten_outputs stores the outputs of the FlattenLayers - vector*> flatten_outputs_; - /// bottom vector holder used in call to the underlying ConcatLayer::Forward - vector*> concat_bottom_vec_; - /// the internal Concat layers that the Flatten layers feed into - shared_ptr > concat_layer_; +class SPPLayer: public Layer { + public: + explicit SPPLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "SPP"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int MinTopBlobs() const { + return 1; + } + // MAX POOL layers can output an extra top blob for the mask; + // others can only output the pooled inputs. + virtual inline int MaxTopBlobs() const { + return + (this->layer_param_.pooling_param().pool() + == PoolingParameter_PoolMethod_MAX) ? 
2 : 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + // calculates the kernel and stride dimensions for the pooling layer, + // returns a correctly configured LayerParameter for a PoolingLayer + virtual LayerParameter GetPoolingParam(const int pyramid_level, + const int bottom_h, const int bottom_w, const SPPParameter spp_param); + + int pyramid_height_; + int bottom_h_, bottom_w_; + int channels_; + int kernel_h_, kernel_w_; + int pad_h_, pad_w_; + + /// the internal Split layer that feeds the pooling layers + shared_ptr > split_layer_; + /// top vector holder used in call to the underlying SplitLayer::Forward + vector*> split_top_vec_; + /// bottom vector holder used in call to the underlying PoolingLayer::Forward + vector*>*> pooling_bottom_vecs_; + /// the internal Pooling layers of different kernel sizes + vector > > pooling_layers_; + /// top vector holders used in call to the underlying PoolingLayer::Forward + vector*>*> pooling_top_vecs_; + /// pooling_outputs stores the outputs of the PoolingLayers + vector*> pooling_outputs_; + /// the internal Flatten layers that the Pooling layers feed into + vector*> flatten_layers_; + /// top vector holders used in call to the underlying FlattenLayer::Forward + vector*>*> flatten_top_vecs_; + /// flatten_outputs stores the outputs of the FlattenLayers + vector*> flatten_outputs_; + /// bottom vector holder used in call to the underlying ConcatLayer::Forward + vector*> concat_bottom_vec_; + /// the internal Concat layers that the Flatten layers feed into + shared_ptr > concat_layer_; }; } // namespace caffe diff --git a/models/bvlc_alexnet/bvlc_alexnet/solver_without_dropout.prototxt b/models/bvlc_alexnet/bvlc_alexnet/solver_without_dropout.prototxt new file mode 100644 index 00000000..37b1d0d3 --- /dev/null +++ b/models/bvlc_alexnet/bvlc_alexnet/solver_without_dropout.prototxt @@ -0,0 +1,14 @@ +net: "models/bvlc_alexnet/train_val_without_dropout.prototxt" +test_iter: 1 +test_interval: 1000 +base_lr: 0.01 +lr_policy: "step" +gamma: 0.1 +stepsize: 100000 +display: 1 +max_iter: 450000 +momentum: 0.9 +weight_decay: 0.0005 +snapshot: 10000 +snapshot_prefix: "models/bvlc_alexnet/caffe_alexnet_train" +solver_mode: GPU diff --git a/models/bvlc_alexnet/bvlc_alexnet/train_val_without_dropout.prototxt b/models/bvlc_alexnet/bvlc_alexnet/train_val_without_dropout.prototxt new file mode 100644 index 00000000..f269ca0d --- /dev/null +++ b/models/bvlc_alexnet/bvlc_alexnet/train_val_without_dropout.prototxt @@ -0,0 +1,366 @@ +name: "AlexNet" +layer { + name: "data" + type: "Data" + top: "data" + top: "label" + include { + phase: TRAIN + } + transform_param { + mirror: true + crop_size: 227 + mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto" + } + data_param { + source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb" + batch_size: 256 + backend: LMDB + } +} +layer { + name: "data" + type: "Data" + top: "data" + top: "label" + include { + phase: TEST + } + transform_param { + mirror: false + crop_size: 227 + mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto" + } + data_param { + source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb" + batch_size: 50 + backend: LMDB + } +} +layer { + name: "conv1" + type: "Convolution" + bottom: "data" + top: "conv1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + 
num_output: 96 + kernel_size: 11 + stride: 4 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu1" + type: "ReLU" + bottom: "conv1" + top: "conv1" +} +layer { + name: "norm1" + type: "LRN" + bottom: "conv1" + top: "norm1" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layer { + name: "pool1" + type: "Pooling" + bottom: "norm1" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "conv2" + type: "Convolution" + bottom: "pool1" + top: "conv2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 2 + kernel_size: 5 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu2" + type: "ReLU" + bottom: "conv2" + top: "conv2" +} +layer { + name: "norm2" + type: "LRN" + bottom: "conv2" + top: "norm2" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layer { + name: "pool2" + type: "Pooling" + bottom: "norm2" + top: "pool2" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "conv3" + type: "Convolution" + bottom: "pool2" + top: "conv3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu3" + type: "ReLU" + bottom: "conv3" + top: "conv3" +} +layer { + name: "conv4" + type: "Convolution" + bottom: "conv3" + top: "conv4" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu4" + type: "ReLU" + bottom: "conv4" + top: "conv4" +} +layer { + name: "conv5" + type: "Convolution" + bottom: "conv4" + top: "conv5" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu5" + type: "ReLU" + bottom: "conv5" + top: "conv5" +} +layer { + name: "pool5" + type: "Pooling" + bottom: "conv5" + top: "pool5" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "fc6" + type: "InnerProduct" + bottom: "pool5" + top: "fc6" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 4096 + weight_filler { + type: "gaussian" + std: 0.005 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu6" + type: "ReLU" + bottom: "fc6" + top: "fc6" +} +layer { + name: "fc7" + type: "InnerProduct" + bottom: "fc6" + top: "fc7" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 4096 + weight_filler { + type: "gaussian" + std: 0.005 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu7" + type: "ReLU" + bottom: "fc7" + top: "fc7" +} +layer { + name: "fc8" + type: "InnerProduct" + bottom: "fc7" + top: "fc8" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + 
lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 1000 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "accuracy" + type: "Accuracy" + bottom: "fc8" + bottom: "label" + top: "accuracy" + include { + phase: TEST + } +} +layer { + name: "loss" + type: "SoftmaxWithLoss" + bottom: "fc8" + bottom: "label" + top: "loss" +} diff --git a/models/bvlc_alexnet/solver.prototxt b/models/bvlc_alexnet/solver.prototxt index 129265e6..6f23e9d1 100644 --- a/models/bvlc_alexnet/solver.prototxt +++ b/models/bvlc_alexnet/solver.prototxt @@ -1,11 +1,11 @@ net: "models/bvlc_alexnet/train_val.prototxt" -test_iter: 1000 +test_iter: 1 test_interval: 1000 base_lr: 0.01 lr_policy: "step" gamma: 0.1 stepsize: 100000 -display: 20 +display: 1 max_iter: 450000 momentum: 0.9 weight_decay: 0.0005 diff --git a/models/bvlc_alexnet/train_val.prototxt b/models/bvlc_alexnet/train_val.prototxt index 588b4ea7..1f9654be 100644 --- a/models/bvlc_alexnet/train_val.prototxt +++ b/models/bvlc_alexnet/train_val.prototxt @@ -10,10 +10,10 @@ layer { transform_param { mirror: true crop_size: 227 - mean_file: "data/ilsvrc12/imagenet_mean.binaryproto" + mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto" } data_param { - source: "examples/imagenet/ilsvrc12_train_lmdb" + source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb" batch_size: 256 backend: LMDB } @@ -29,10 +29,10 @@ layer { transform_param { mirror: false crop_size: 227 - mean_file: "data/ilsvrc12/imagenet_mean.binaryproto" + mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto" } data_param { - source: "examples/imagenet/ilsvrc12_val_lmdb" + source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb" batch_size: 50 backend: LMDB } diff --git a/src/caffe/CMakeLists.txt b/src/caffe/CMakeLists.txt index 40e6c11f..3e675c20 100644 --- a/src/caffe/CMakeLists.txt +++ b/src/caffe/CMakeLists.txt @@ -32,5 +32,3 @@ install(TARGETS caffe proto EXPORT CaffeTargets DESTINATION lib) file(WRITE ${PROJECT_BINARY_DIR}/__init__.py) list(APPEND proto_python ${PROJECT_BINARY_DIR}/__init__.py) install(PROGRAMS ${proto_python} DESTINATION python/caffe/proto) - - diff --git a/src/caffe/blob.cpp b/src/caffe/blob.cpp index 94fdcc35..ece07d14 100644 --- a/src/caffe/blob.cpp +++ b/src/caffe/blob.cpp @@ -55,22 +55,20 @@ void Blob::ReshapeLike(const Blob& other) { template Blob::Blob(const int num, const int channels, const int height, const int width) - // capacity_ must be initialized before calling Reshape - : capacity_(0) { + : capacity_(0) { Reshape(num, channels, height, width); } template Blob::Blob(const vector& shape) - // capacity_ must be initialized before calling Reshape - : capacity_(0) { + : capacity_(0) { Reshape(shape); } template const Dtype* Blob::cpu_data() const { - CHECK(data_); - return (const Dtype*)data_->cpu_data(); + CHECK (data_); + return (const Dtype*) data_->cpu_data(); } template @@ -81,43 +79,49 @@ void Blob::set_cpu_data(Dtype* data) { template const Dtype* Blob::gpu_data() const { - CHECK(data_); - return (const Dtype*)data_->gpu_data(); + CHECK (data_); + return (const Dtype*) data_->gpu_data(); +} + +template +const Dtype* Blob::gpu_cache_data() const { + CHECK (data_); + return (const Dtype*) data_->gpu_cache_data(); } template const Dtype* Blob::cpu_diff() const { - CHECK(diff_); - return (const Dtype*)diff_->cpu_data(); + CHECK (diff_); + return (const Dtype*) diff_->cpu_data(); } template const Dtype* Blob::gpu_diff() const { - 
CHECK(diff_); - return (const Dtype*)diff_->gpu_data(); + CHECK (diff_); + return (const Dtype*) diff_->gpu_data(); } template Dtype* Blob::mutable_cpu_data() { - CHECK(data_); + CHECK (data_); return static_cast(data_->mutable_cpu_data()); } template Dtype* Blob::mutable_gpu_data() { - CHECK(data_); + CHECK (data_); return static_cast(data_->mutable_gpu_data()); } template Dtype* Blob::mutable_cpu_diff() { - CHECK(diff_); + CHECK (diff_); return static_cast(diff_->mutable_cpu_data()); } template Dtype* Blob::mutable_gpu_diff() { - CHECK(diff_); + CHECK (diff_); return static_cast(diff_->mutable_gpu_data()); } @@ -136,8 +140,12 @@ void Blob::ShareDiff(const Blob& other) { // The "update" method is used for parameter blobs in a Net, which are stored // as Blob or Blob -- hence we do not define it for // Blob or Blob. -template <> void Blob::Update() { NOT_IMPLEMENTED; } -template <> void Blob::Update() { NOT_IMPLEMENTED; } +template <> void Blob::Update() { + NOT_IMPLEMENTED; +} +template <> void Blob::Update() { + NOT_IMPLEMENTED; +} template void Blob::Update() { @@ -145,17 +153,15 @@ void Blob::Update() { switch (data_->head()) { case SyncedMemory::HEAD_AT_CPU: // perform computation on CPU - caffe_axpy(count_, Dtype(-1), - static_cast(diff_->cpu_data()), - static_cast(data_->mutable_cpu_data())); + caffe_axpy < Dtype + > (count_, Dtype(-1), static_cast(diff_->cpu_data()), static_cast(data_->mutable_cpu_data())); break; case SyncedMemory::HEAD_AT_GPU: case SyncedMemory::SYNCED: #ifndef CPU_ONLY // perform computation on GPU - caffe_gpu_axpy(count_, Dtype(-1), - static_cast(diff_->gpu_data()), - static_cast(data_->mutable_gpu_data())); + caffe_gpu_axpy < Dtype + > (count_, Dtype(-1), static_cast(diff_->gpu_data()), static_cast(data_->mutable_gpu_data())); #else NO_GPU; #endif @@ -177,7 +183,9 @@ template <> int Blob::asum_data() const { template Dtype Blob::asum_data() const { - if (!data_) { return 0; } + if (!data_) { + return 0; + } switch (data_->head()) { case SyncedMemory::HEAD_AT_CPU: return caffe_cpu_asum(count_, cpu_data()); @@ -212,7 +220,9 @@ template <> int Blob::asum_diff() const { template Dtype Blob::asum_diff() const { - if (!diff_) { return 0; } + if (!diff_) { + return 0; + } switch (diff_->head()) { case SyncedMemory::HEAD_AT_CPU: return caffe_cpu_asum(count_, cpu_diff()); @@ -249,7 +259,9 @@ template Dtype Blob::sumsq_data() const { Dtype sumsq; const Dtype* data; - if (!data_) { return 0; } + if (!data_) { + return 0; + } switch (data_->head()) { case SyncedMemory::HEAD_AT_CPU: data = cpu_data(); @@ -286,7 +298,9 @@ template Dtype Blob::sumsq_diff() const { Dtype sumsq; const Dtype* diff; - if (!diff_) { return 0; } + if (!diff_) { + return 0; + } switch (diff_->head()) { case SyncedMemory::HEAD_AT_CPU: diff = cpu_diff(); @@ -320,7 +334,9 @@ template <> void Blob::scale_data(int scale_factor) { template void Blob::scale_data(Dtype scale_factor) { Dtype* data; - if (!data_) { return; } + if (!data_) { + return; + } switch (data_->head()) { case SyncedMemory::HEAD_AT_CPU: data = mutable_cpu_data(); @@ -353,7 +369,9 @@ template <> void Blob::scale_diff(int scale_factor) { template void Blob::scale_diff(Dtype scale_factor) { Dtype* diff; - if (!diff_) { return; } + if (!diff_) { + return; + } switch (diff_->head()) { case SyncedMemory::HEAD_AT_CPU: diff = mutable_cpu_diff(); @@ -377,19 +395,17 @@ void Blob::scale_diff(Dtype scale_factor) { template bool Blob::ShapeEquals(const BlobProto& other) { - if (other.has_num() || other.has_channels() || - other.has_height() || 
other.has_width()) { + if (other.has_num() || other.has_channels() || other.has_height() + || other.has_width()) { // Using deprecated 4D Blob dimensions -- // shape is (num, channels, height, width). // Note: we do not use the normal Blob::num(), Blob::channels(), etc. // methods as these index from the beginning of the blob shape, where legacy // parameter blobs were indexed from the end of the blob shape (e.g., bias // Blob shape (1 x 1 x 1 x N), IP layer weight Blob shape (1 x 1 x M x N)). - return shape_.size() <= 4 && - LegacyShape(-4) == other.num() && - LegacyShape(-3) == other.channels() && - LegacyShape(-2) == other.height() && - LegacyShape(-1) == other.width(); + return shape_.size() <= 4 && LegacyShape(-4) == other.num() + && LegacyShape(-3) == other.channels() + && LegacyShape(-2) == other.height() && LegacyShape(-1) == other.width(); } vector other_shape(other.shape().dim_size()); for (int i = 0; i < other.shape().dim_size(); ++i) { @@ -410,10 +426,10 @@ void Blob::CopyFrom(const Blob& source, bool copy_diff, bool reshape) { switch (Caffe::mode()) { case Caffe::GPU: if (copy_diff) { - caffe_copy(count_, source.gpu_diff(), + caffe_gpu_copy(count_, source.gpu_diff(), static_cast(diff_->mutable_gpu_data())); } else { - caffe_copy(count_, source.gpu_data(), + caffe_gpu_copy(count_, source.gpu_data(), static_cast(data_->mutable_gpu_data())); } break; @@ -435,8 +451,8 @@ template void Blob::FromProto(const BlobProto& proto, bool reshape) { if (reshape) { vector shape; - if (proto.has_num() || proto.has_channels() || - proto.has_height() || proto.has_width()) { + if (proto.has_num() || proto.has_channels() || proto.has_height() + || proto.has_width()) { // Using deprecated 4D Blob dimensions -- // shape is (num, channels, height, width). shape.resize(4); @@ -487,9 +503,9 @@ void Blob::ToProto(BlobProto* proto, bool write_diff) const { } } -INSTANTIATE_CLASS(Blob); -template class Blob; -template class Blob; +INSTANTIATE_CLASS (Blob); +template class Blob ; +template class Blob ; } // namespace caffe diff --git a/src/caffe/cmake_install.cmake b/src/caffe/cmake_install.cmake new file mode 100644 index 00000000..f98ef538 --- /dev/null +++ b/src/caffe/cmake_install.cmake @@ -0,0 +1,79 @@ +# Install script for directory: /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe + +# Set the install prefix +IF(NOT DEFINED CMAKE_INSTALL_PREFIX) + SET(CMAKE_INSTALL_PREFIX "/home/yugao/caffe-merge-junli/caffe-yb/caffe/install") +ENDIF(NOT DEFINED CMAKE_INSTALL_PREFIX) +STRING(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}") + +# Set the install configuration name. +IF(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME) + IF(BUILD_TYPE) + STRING(REGEX REPLACE "^[^A-Za-z0-9_]+" "" + CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}") + ELSE(BUILD_TYPE) + SET(CMAKE_INSTALL_CONFIG_NAME "Release") + ENDIF(BUILD_TYPE) + MESSAGE(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"") +ENDIF(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME) + +# Set the component getting installed. +IF(NOT CMAKE_INSTALL_COMPONENT) + IF(COMPONENT) + MESSAGE(STATUS "Install component: \"${COMPONENT}\"") + SET(CMAKE_INSTALL_COMPONENT "${COMPONENT}") + ELSE(COMPONENT) + SET(CMAKE_INSTALL_COMPONENT) + ENDIF(COMPONENT) +ENDIF(NOT CMAKE_INSTALL_COMPONENT) + +# Install shared libraries without execute permission? 
+IF(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE) + SET(CMAKE_INSTALL_SO_NO_EXE "1") +ENDIF(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE) + +IF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified") + FILE(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/include" TYPE DIRECTORY FILES "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe") +ENDIF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified") + +IF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified") + FILE(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/include/caffe/proto" TYPE FILE FILES "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.h") +ENDIF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified") + +IF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified") + IF(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/lib/libcaffe.so" AND + NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/lib/libcaffe.so") + FILE(RPATH_CHECK + FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/lib/libcaffe.so" + RPATH "/home/yugao/caffe-merge-junli/caffe-yb/caffe/install/lib:/usr/local/cuda/lib64:/usr/local/lib") + ENDIF() + FILE(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/lib" TYPE SHARED_LIBRARY FILES "/home/yugao/caffe-merge-junli/caffe-yb/caffe/lib/libcaffe.so") + IF(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/lib/libcaffe.so" AND + NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/lib/libcaffe.so") + FILE(RPATH_CHANGE + FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/lib/libcaffe.so" + OLD_RPATH "/usr/local/cuda/lib64:/usr/local/lib:::::::::::::::::::::::::::::::::::::::::::::::::::::::::" + NEW_RPATH "/home/yugao/caffe-merge-junli/caffe-yb/caffe/install/lib:/usr/local/cuda/lib64:/usr/local/lib") + IF(CMAKE_INSTALL_DO_STRIP) + EXECUTE_PROCESS(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/lib/libcaffe.so") + ENDIF(CMAKE_INSTALL_DO_STRIP) + ENDIF() +ENDIF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified") + +IF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified") + FILE(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/lib" TYPE STATIC_LIBRARY FILES "/home/yugao/caffe-merge-junli/caffe-yb/caffe/lib/libproto.a") +ENDIF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified") + +IF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified") + FILE(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/python/caffe/proto" TYPE PROGRAM FILES + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe_pb2.py" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/__init__.py" + ) +ENDIF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified") + +IF(NOT CMAKE_INSTALL_LOCAL_ONLY) + # Include the install script for each subdirectory. 
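# Rough usage sketch for this generated install script (assumptions: a
# standard CMake build tree; the absolute /home/yugao/... paths above are the
# original author's). It is what `make install` executes, and it can also be
# run directly in CMake script mode; the INCLUDE just below then pulls in the
# per-subdirectory rules (src/caffe/test):
#
#   make install DESTDIR=/tmp/caffe-stage        # staged install
#   cmake -DCMAKE_INSTALL_PREFIX=/opt/caffe -P src/caffe/cmake_install.cmake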
+ INCLUDE("/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/cmake_install.cmake") + +ENDIF(NOT CMAKE_INSTALL_LOCAL_ONLY) + diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp index af96cac4..9ed4207a 100644 --- a/src/caffe/common.cpp +++ b/src/caffe/common.cpp @@ -11,25 +11,29 @@ shared_ptr Caffe::singleton_; // random seeding int64_t cluster_seedgen(void) { - int64_t s, seed, pid; - FILE* f = fopen("/dev/urandom", "rb"); - if (f && fread(&seed, 1, sizeof(seed), f) == sizeof(seed)) { - fclose(f); - return seed; - } - - LOG(INFO) << "System entropy source not available, " - "using fallback algorithm to generate seed instead."; - if (f) - fclose(f); - - pid = getpid(); - s = time(NULL); - seed = abs(((s * 181) * ((pid - 83) * 359)) % 104729); - return seed; + //To fix: for now we use fixed seed to get same result each time + /* + int64_t s, seed, pid; + FILE* f = fopen("/dev/urandom", "rb"); + if (f && fread(&seed, 1, sizeof(seed), f) == sizeof(seed)) { + fclose(f); + return seed; + } + + LOG(INFO) << "System entropy source not available, " + "using fallback algorithm to generate seed instead."; + if (f) + fclose(f); + + pid = getpid(); + s = time(NULL); + seed = abs(((s * 181) * ((pid - 83) * 359)) % 104729); + //return seed; + LOG(WARNING) << "return fixed seed 37"; + */ + return 37; } - void GlobalInit(int* pargc, char*** pargv) { // Google flags. ::gflags::ParseCommandLineFlags(pargc, pargv, true); @@ -42,9 +46,11 @@ void GlobalInit(int* pargc, char*** pargv) { #ifdef CPU_ONLY // CPU-only Caffe. Caffe::Caffe() - : random_generator_(), mode_(Caffe::CPU) { } +: random_generator_(), mode_(Caffe::CPU) { +} -Caffe::~Caffe() { } +Caffe::~Caffe() { +} void Caffe::set_random_seed(const unsigned int seed) { // RNG seed @@ -59,19 +65,18 @@ void Caffe::DeviceQuery() { NO_GPU; } - class Caffe::RNG::Generator { - public: + public: Generator() : rng_(new caffe::rng_t(cluster_seedgen())) {} explicit Generator(unsigned int seed) : rng_(new caffe::rng_t(seed)) {} - caffe::rng_t* rng() { return rng_.get(); } - private: + caffe::rng_t* rng() {return rng_.get();} + private: shared_ptr rng_; }; -Caffe::RNG::RNG() : generator_(new Generator()) { } +Caffe::RNG::RNG() : generator_(new Generator()) {} -Caffe::RNG::RNG(unsigned int seed) : generator_(new Generator(seed)) { } +Caffe::RNG::RNG(unsigned int seed) : generator_(new Generator(seed)) {} Caffe::RNG& Caffe::RNG::operator=(const RNG& other) { generator_ = other.generator_; @@ -84,116 +89,58 @@ void* Caffe::RNG::generator() { #else // Normal GPU + CPU Caffe. -Caffe::Caffe() - : cublas_handle_(NULL), curand_generator_(NULL), random_generator_(), - mode_(Caffe::CPU) { - // Try to create a cublas handler, and report an error if failed (but we will - // keep the program running as one might just want to run CPU code). - if (cublasCreate(&cublas_handle_) != CUBLAS_STATUS_SUCCESS) { - LOG(ERROR) << "Cannot create Cublas handle. Cublas won't be available."; - } - // Try to create a curand handler. - if (curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT) - != CURAND_STATUS_SUCCESS || - curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen()) - != CURAND_STATUS_SUCCESS) { - LOG(ERROR) << "Cannot create Curand generator. 
Curand won't be available."; +Caffe::Caffe() { + amdDevice.Init(); + cl_int err = clblasSetup(); + if (err != CL_SUCCESS) { + LOG(ERROR) << "clBLAS setup failed " << err; } } Caffe::~Caffe() { - if (cublas_handle_) CUBLAS_CHECK(cublasDestroy(cublas_handle_)); - if (curand_generator_) { - CURAND_CHECK(curandDestroyGenerator(curand_generator_)); - } + clblasTeardown(); } void Caffe::set_random_seed(const unsigned int seed) { - // Curand seed - static bool g_curand_availability_logged = false; - if (Get().curand_generator_) { - CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator(), - seed)); - CURAND_CHECK(curandSetGeneratorOffset(curand_generator(), 0)); - } else { - if (!g_curand_availability_logged) { - LOG(ERROR) << - "Curand not available. Skipping setting the curand seed."; - g_curand_availability_logged = true; - } - } - // RNG seed - Get().random_generator_.reset(new RNG(seed)); + // RNG seed + Get().random_generator_.reset(new RNG(seed)); + caffe_gpu_uniform(0, NULL, seed); + caffe_gpu_uniform((float*)NULL, 0, (float)0.0, (float)1.0, seed); } void Caffe::SetDevice(const int device_id) { - int current_device; - CUDA_CHECK(cudaGetDevice(¤t_device)); - if (current_device == device_id) { + if (amdDevice.GetDevice() == device_id) { return; } - // The call to cudaSetDevice must come before any calls to Get, which - // may perform initialization using the GPU. - CUDA_CHECK(cudaSetDevice(device_id)); - if (Get().cublas_handle_) CUBLAS_CHECK(cublasDestroy(Get().cublas_handle_)); - if (Get().curand_generator_) { - CURAND_CHECK(curandDestroyGenerator(Get().curand_generator_)); - } - CUBLAS_CHECK(cublasCreate(&Get().cublas_handle_)); - CURAND_CHECK(curandCreateGenerator(&Get().curand_generator_, - CURAND_RNG_PSEUDO_DEFAULT)); - CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_, - cluster_seedgen())); + amdDevice.Init(device_id); } void Caffe::DeviceQuery() { - cudaDeviceProp prop; - int device; - if (cudaSuccess != cudaGetDevice(&device)) { - printf("No cuda device present.\n"); - return; - } - CUDA_CHECK(cudaGetDeviceProperties(&prop, device)); - LOG(INFO) << "Device id: " << device; - LOG(INFO) << "Major revision number: " << prop.major; - LOG(INFO) << "Minor revision number: " << prop.minor; - LOG(INFO) << "Name: " << prop.name; - LOG(INFO) << "Total global memory: " << prop.totalGlobalMem; - LOG(INFO) << "Total shared memory per block: " << prop.sharedMemPerBlock; - LOG(INFO) << "Total registers per block: " << prop.regsPerBlock; - LOG(INFO) << "Warp size: " << prop.warpSize; - LOG(INFO) << "Maximum memory pitch: " << prop.memPitch; - LOG(INFO) << "Maximum threads per block: " << prop.maxThreadsPerBlock; - LOG(INFO) << "Maximum dimension of block: " - << prop.maxThreadsDim[0] << ", " << prop.maxThreadsDim[1] << ", " - << prop.maxThreadsDim[2]; - LOG(INFO) << "Maximum dimension of grid: " - << prop.maxGridSize[0] << ", " << prop.maxGridSize[1] << ", " - << prop.maxGridSize[2]; - LOG(INFO) << "Clock rate: " << prop.clockRate; - LOG(INFO) << "Total constant memory: " << prop.totalConstMem; - LOG(INFO) << "Texture alignment: " << prop.textureAlignment; - LOG(INFO) << "Concurrent copy and execution: " - << (prop.deviceOverlap ? "Yes" : "No"); - LOG(INFO) << "Number of multiprocessors: " << prop.multiProcessorCount; - LOG(INFO) << "Kernel execution timeout: " - << (prop.kernelExecTimeoutEnabled ? 
"Yes" : "No"); - return; + amdDevice.DeviceQuery(); } - class Caffe::RNG::Generator { - public: - Generator() : rng_(new caffe::rng_t(cluster_seedgen())) {} - explicit Generator(unsigned int seed) : rng_(new caffe::rng_t(seed)) {} - caffe::rng_t* rng() { return rng_.get(); } - private: - shared_ptr rng_; + public: + Generator() + : rng_(new caffe::rng_t(cluster_seedgen())) { + } + explicit Generator(unsigned int seed) + : rng_(new caffe::rng_t(seed)) { + } + caffe::rng_t* rng() { + return rng_.get(); + } + private: + shared_ptr rng_; }; -Caffe::RNG::RNG() : generator_(new Generator()) { } +Caffe::RNG::RNG() + : generator_(new Generator()) { +} -Caffe::RNG::RNG(unsigned int seed) : generator_(new Generator(seed)) { } +Caffe::RNG::RNG(unsigned int seed) + : generator_(new Generator(seed)) { +} Caffe::RNG& Caffe::RNG::operator=(const RNG& other) { generator_.reset(other.generator_.get()); @@ -204,68 +151,6 @@ void* Caffe::RNG::generator() { return static_cast(generator_->rng()); } -const char* cublasGetErrorString(cublasStatus_t error) { - switch (error) { - case CUBLAS_STATUS_SUCCESS: - return "CUBLAS_STATUS_SUCCESS"; - case CUBLAS_STATUS_NOT_INITIALIZED: - return "CUBLAS_STATUS_NOT_INITIALIZED"; - case CUBLAS_STATUS_ALLOC_FAILED: - return "CUBLAS_STATUS_ALLOC_FAILED"; - case CUBLAS_STATUS_INVALID_VALUE: - return "CUBLAS_STATUS_INVALID_VALUE"; - case CUBLAS_STATUS_ARCH_MISMATCH: - return "CUBLAS_STATUS_ARCH_MISMATCH"; - case CUBLAS_STATUS_MAPPING_ERROR: - return "CUBLAS_STATUS_MAPPING_ERROR"; - case CUBLAS_STATUS_EXECUTION_FAILED: - return "CUBLAS_STATUS_EXECUTION_FAILED"; - case CUBLAS_STATUS_INTERNAL_ERROR: - return "CUBLAS_STATUS_INTERNAL_ERROR"; -#if CUDA_VERSION >= 6000 - case CUBLAS_STATUS_NOT_SUPPORTED: - return "CUBLAS_STATUS_NOT_SUPPORTED"; -#endif -#if CUDA_VERSION >= 6050 - case CUBLAS_STATUS_LICENSE_ERROR: - return "CUBLAS_STATUS_LICENSE_ERROR"; -#endif - } - return "Unknown cublas status"; -} - -const char* curandGetErrorString(curandStatus_t error) { - switch (error) { - case CURAND_STATUS_SUCCESS: - return "CURAND_STATUS_SUCCESS"; - case CURAND_STATUS_VERSION_MISMATCH: - return "CURAND_STATUS_VERSION_MISMATCH"; - case CURAND_STATUS_NOT_INITIALIZED: - return "CURAND_STATUS_NOT_INITIALIZED"; - case CURAND_STATUS_ALLOCATION_FAILED: - return "CURAND_STATUS_ALLOCATION_FAILED"; - case CURAND_STATUS_TYPE_ERROR: - return "CURAND_STATUS_TYPE_ERROR"; - case CURAND_STATUS_OUT_OF_RANGE: - return "CURAND_STATUS_OUT_OF_RANGE"; - case CURAND_STATUS_LENGTH_NOT_MULTIPLE: - return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; - case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: - return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; - case CURAND_STATUS_LAUNCH_FAILURE: - return "CURAND_STATUS_LAUNCH_FAILURE"; - case CURAND_STATUS_PREEXISTING_FAILURE: - return "CURAND_STATUS_PREEXISTING_FAILURE"; - case CURAND_STATUS_INITIALIZATION_FAILED: - return "CURAND_STATUS_INITIALIZATION_FAILED"; - case CURAND_STATUS_ARCH_MISMATCH: - return "CURAND_STATUS_ARCH_MISMATCH"; - case CURAND_STATUS_INTERNAL_ERROR: - return "CURAND_STATUS_INTERNAL_ERROR"; - } - return "Unknown curand status"; -} - #endif // CPU_ONLY } // namespace caffe diff --git a/src/caffe/data_transformer.cpp b/src/caffe/data_transformer.cpp index 22633922..1137bac3 100644 --- a/src/caffe/data_transformer.cpp +++ b/src/caffe/data_transformer.cpp @@ -7,17 +7,17 @@ #include "caffe/util/io.hpp" #include "caffe/util/math_functions.hpp" #include "caffe/util/rng.hpp" - +#include "caffe/util/benchmark.hpp" namespace caffe { -template +template 
DataTransformer::DataTransformer(const TransformationParameter& param, Phase phase) : param_(param), phase_(phase) { // check if we want to use mean_file if (param_.has_mean_file()) { - CHECK_EQ(param_.mean_value_size(), 0) << - "Cannot specify mean_file and mean_value at the same time"; + CHECK_EQ(param_.mean_value_size(), 0) + << "Cannot specify mean_file and mean_value at the same time"; const string& mean_file = param.mean_file(); LOG(INFO) << "Loading mean file from: " << mean_file; BlobProto blob_proto; @@ -26,17 +26,17 @@ DataTransformer::DataTransformer(const TransformationParameter& param, } // check if we want to use mean_value if (param_.mean_value_size() > 0) { - CHECK(param_.has_mean_file() == false) << - "Cannot specify mean_file and mean_value at the same time"; + CHECK(param_.has_mean_file() == false) + << "Cannot specify mean_file and mean_value at the same time"; for (int c = 0; c < param_.mean_value_size(); ++c) { mean_values_.push_back(param_.mean_value(c)); } } } -template +template void DataTransformer::Transform(const Datum& datum, - Dtype* transformed_data) { + Dtype* transformed_data) { const string& data = datum.data(); const int datum_channels = datum.channels(); const int datum_height = datum.height(); @@ -61,8 +61,9 @@ void DataTransformer::Transform(const Datum& datum, mean = data_mean_.mutable_cpu_data(); } if (has_mean_values) { - CHECK(mean_values_.size() == 1 || mean_values_.size() == datum_channels) << - "Specify either 1 mean_value or as many as channels: " << datum_channels; + CHECK(mean_values_.size() == 1 || mean_values_.size() == datum_channels) + << "Specify either 1 mean_value or as many as channels: " + << datum_channels; if (datum_channels > 1 && mean_values_.size() == 1) { // Replicate the mean_value for simplicity for (int c = 1; c < datum_channels; ++c) { @@ -102,17 +103,17 @@ void DataTransformer::Transform(const Datum& datum, } if (has_uint8) { datum_element = - static_cast(static_cast(data[data_index])); + static_cast(static_cast(data[data_index])); } else { datum_element = datum.float_data(data_index); } if (has_mean_file) { - transformed_data[top_index] = - (datum_element - mean[data_index]) * scale; + transformed_data[top_index] = (datum_element - mean[data_index]) + * scale; } else { if (has_mean_values) { - transformed_data[top_index] = - (datum_element - mean_values_[c]) * scale; + transformed_data[top_index] = (datum_element - mean_values_[c]) + * scale; } else { transformed_data[top_index] = datum_element * scale; } @@ -122,16 +123,17 @@ void DataTransformer::Transform(const Datum& datum, } } -template +template void DataTransformer::Transform(const Datum& datum, - Blob* transformed_blob) { + Blob* transformed_blob) { + // If datum is encoded, decoded and transform the cv::image. if (datum.encoded()) { CHECK(!(param_.force_color() && param_.force_gray())) << "cannot set both force_color and force_gray"; cv::Mat cv_img; if (param_.force_color() || param_.force_gray()) { - // If force_color then decode in color otherwise decode in gray. + // If force_color then decode in color otherwise decode in gray. 
cv_img = DecodeDatumToCVMat(datum, param_.force_color()); } else { cv_img = DecodeDatumToCVMatNative(datum); @@ -172,9 +174,9 @@ void DataTransformer::Transform(const Datum& datum, Transform(datum, transformed_data); } -template +template void DataTransformer::Transform(const vector & datum_vector, - Blob* transformed_blob) { + Blob* transformed_blob) { const int datum_num = datum_vector.size(); const int num = transformed_blob->num(); const int channels = transformed_blob->channels(); @@ -182,9 +184,9 @@ void DataTransformer::Transform(const vector & datum_vector, const int width = transformed_blob->width(); CHECK_GT(datum_num, 0) << "There is no datum to add"; - CHECK_LE(datum_num, num) << - "The size of datum_vector must be no greater than transformed_blob->num()"; - Blob uni_blob(1, channels, height, width); + CHECK_LE(datum_num, num) + << "The size of datum_vector must be no greater than transformed_blob->num()"; + Blob < Dtype > uni_blob(1, channels, height, width); for (int item_id = 0; item_id < datum_num; ++item_id) { int offset = transformed_blob->offset(item_id); uni_blob.set_cpu_data(transformed_blob->mutable_cpu_data() + offset); @@ -192,9 +194,9 @@ void DataTransformer::Transform(const vector & datum_vector, } } -template +template void DataTransformer::Transform(const vector & mat_vector, - Blob* transformed_blob) { + Blob* transformed_blob) { const int mat_num = mat_vector.size(); const int num = transformed_blob->num(); const int channels = transformed_blob->channels(); @@ -202,9 +204,9 @@ void DataTransformer::Transform(const vector & mat_vector, const int width = transformed_blob->width(); CHECK_GT(mat_num, 0) << "There is no MAT to add"; - CHECK_EQ(mat_num, num) << - "The size of mat_vector must be equals to transformed_blob->num()"; - Blob uni_blob(1, channels, height, width); + CHECK_EQ(mat_num, num) + << "The size of mat_vector must be equals to transformed_blob->num()"; + Blob < Dtype > uni_blob(1, channels, height, width); for (int item_id = 0; item_id < mat_num; ++item_id) { int offset = transformed_blob->offset(item_id); uni_blob.set_cpu_data(transformed_blob->mutable_cpu_data() + offset); @@ -212,9 +214,9 @@ void DataTransformer::Transform(const vector & mat_vector, } } -template +template void DataTransformer::Transform(const cv::Mat& cv_img, - Blob* transformed_blob) { + Blob* transformed_blob) { const int crop_size = param_.crop_size(); const int img_channels = cv_img.channels(); const int img_height = cv_img.rows; @@ -250,8 +252,9 @@ void DataTransformer::Transform(const cv::Mat& cv_img, mean = data_mean_.mutable_cpu_data(); } if (has_mean_values) { - CHECK(mean_values_.size() == 1 || mean_values_.size() == img_channels) << - "Specify either 1 mean_value or as many as channels: " << img_channels; + CHECK(mean_values_.size() == 1 || mean_values_.size() == img_channels) + << "Specify either 1 mean_value or as many as channels: " + << img_channels; if (img_channels > 1 && mean_values_.size() == 1) { // Replicate the mean_value for simplicity for (int c = 1; c < img_channels; ++c) { @@ -286,7 +289,7 @@ void DataTransformer::Transform(const cv::Mat& cv_img, Dtype* transformed_data = transformed_blob->mutable_cpu_data(); int top_index; for (int h = 0; h < height; ++h) { - const uchar* ptr = cv_cropped_img.ptr(h); + const uchar* ptr = cv_cropped_img.ptr < uchar > (h); int img_index = 0; for (int w = 0; w < width; ++w) { for (int c = 0; c < img_channels; ++c) { @@ -299,12 +302,10 @@ void DataTransformer::Transform(const cv::Mat& cv_img, Dtype pixel = 
static_cast(ptr[img_index++]); if (has_mean_file) { int mean_index = (c * img_height + h_off + h) * img_width + w_off + w; - transformed_data[top_index] = - (pixel - mean[mean_index]) * scale; + transformed_data[top_index] = (pixel - mean[mean_index]) * scale; } else { if (has_mean_values) { - transformed_data[top_index] = - (pixel - mean_values_[c]) * scale; + transformed_data[top_index] = (pixel - mean_values_[c]) * scale; } else { transformed_data[top_index] = pixel * scale; } @@ -314,9 +315,9 @@ void DataTransformer::Transform(const cv::Mat& cv_img, } } -template +template void DataTransformer::Transform(Blob* input_blob, - Blob* transformed_blob) { + Blob* transformed_blob) { const int crop_size = param_.crop_size(); const int input_num = input_blob->num(); const int input_channels = input_blob->channels(); @@ -326,11 +327,11 @@ void DataTransformer::Transform(Blob* input_blob, if (transformed_blob->count() == 0) { // Initialize transformed_blob with the right shape. if (crop_size) { - transformed_blob->Reshape(input_num, input_channels, - crop_size, crop_size); + transformed_blob->Reshape(input_num, input_channels, crop_size, + crop_size); } else { - transformed_blob->Reshape(input_num, input_channels, - input_height, input_width); + transformed_blob->Reshape(input_num, input_channels, input_height, + input_width); } } @@ -345,7 +346,6 @@ void DataTransformer::Transform(Blob* input_blob, CHECK_GE(input_height, height); CHECK_GE(input_width, width); - const Dtype scale = param_.scale(); const bool do_mirror = param_.mirror() && Rand(2); const bool has_mean_file = param_.has_mean_file(); @@ -376,14 +376,15 @@ void DataTransformer::Transform(Blob* input_blob, CHECK_EQ(input_width, data_mean_.width()); for (int n = 0; n < input_num; ++n) { int offset = input_blob->offset(n); - caffe_sub(data_mean_.count(), input_data + offset, - data_mean_.cpu_data(), input_data + offset); + caffe_sub(data_mean_.count(), input_data + offset, data_mean_.cpu_data(), + input_data + offset); } } if (has_mean_values) { - CHECK(mean_values_.size() == 1 || mean_values_.size() == input_channels) << - "Specify either 1 mean_value or as many as channels: " << input_channels; + CHECK(mean_values_.size() == 1 || mean_values_.size() == input_channels) + << "Specify either 1 mean_value or as many as channels: " + << input_channels; if (mean_values_.size() == 1) { caffe_add_scalar(input_blob->count(), -(mean_values_[0]), input_data); } else { @@ -391,7 +392,7 @@ void DataTransformer::Transform(Blob* input_blob, for (int c = 0; c < input_channels; ++c) { int offset = input_blob->offset(n, c); caffe_add_scalar(input_height * input_width, -(mean_values_[c]), - input_data + offset); + input_data + offset); } } } @@ -411,7 +412,7 @@ void DataTransformer::Transform(Blob* input_blob, if (do_mirror) { int top_index_w = top_index_h + width - 1; for (int w = 0; w < width; ++w) { - transformed_data[top_index_w-w] = input_data[data_index_h + w]; + transformed_data[top_index_w - w] = input_data[data_index_h + w]; } } else { for (int w = 0; w < width; ++w) { @@ -427,14 +428,14 @@ void DataTransformer::Transform(Blob* input_blob, } } -template +template vector DataTransformer::InferBlobShape(const Datum& datum) { if (datum.encoded()) { CHECK(!(param_.force_color() && param_.force_gray())) << "cannot set both force_color and force_gray"; cv::Mat cv_img; if (param_.force_color() || param_.force_gray()) { - // If force_color then decode in color otherwise decode in gray. + // If force_color then decode in color otherwise decode in gray. 
cv_img = DecodeDatumToCVMat(datum, param_.force_color()); } else { cv_img = DecodeDatumToCVMatNative(datum); @@ -455,12 +456,12 @@ vector DataTransformer::InferBlobShape(const Datum& datum) { vector shape(4); shape[0] = 1; shape[1] = datum_channels; - shape[2] = (crop_size)? crop_size: datum_height; - shape[3] = (crop_size)? crop_size: datum_width; + shape[2] = (crop_size) ? crop_size : datum_height; + shape[3] = (crop_size) ? crop_size : datum_width; return shape; } -template +template vector DataTransformer::InferBlobShape( const vector & datum_vector) { const int num = datum_vector.size(); @@ -472,7 +473,7 @@ vector DataTransformer::InferBlobShape( return shape; } -template +template vector DataTransformer::InferBlobShape(const cv::Mat& cv_img) { const int crop_size = param_.crop_size(); const int img_channels = cv_img.channels(); @@ -486,12 +487,12 @@ vector DataTransformer::InferBlobShape(const cv::Mat& cv_img) { vector shape(4); shape[0] = 1; shape[1] = img_channels; - shape[2] = (crop_size)? crop_size: img_height; - shape[3] = (crop_size)? crop_size: img_width; + shape[2] = (crop_size) ? crop_size : img_height; + shape[3] = (crop_size) ? crop_size : img_width; return shape; } -template +template vector DataTransformer::InferBlobShape( const vector & mat_vector) { const int num = mat_vector.size(); @@ -505,8 +506,8 @@ vector DataTransformer::InferBlobShape( template void DataTransformer::InitRand() { - const bool needs_rand = param_.mirror() || - (phase_ == TRAIN && param_.crop_size()); + const bool needs_rand = param_.mirror() + || (phase_ == TRAIN && param_.crop_size()); if (needs_rand) { const unsigned int rng_seed = caffe_rng_rand(); rng_.reset(new Caffe::RNG(rng_seed)); @@ -517,13 +518,12 @@ void DataTransformer::InitRand() { template int DataTransformer::Rand(int n) { - CHECK(rng_); + CHECK (rng_); CHECK_GT(n, 0); - caffe::rng_t* rng = - static_cast(rng_->generator()); + caffe::rng_t* rng = static_cast(rng_->generator()); return ((*rng)() % n); } -INSTANTIATE_CLASS(DataTransformer); +INSTANTIATE_CLASS (DataTransformer); } // namespace caffe diff --git a/src/caffe/device.cpp b/src/caffe/device.cpp new file mode 100644 index 00000000..fcbffe09 --- /dev/null +++ b/src/caffe/device.cpp @@ -0,0 +1,426 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + +#include "caffe/common.hpp" +#include "caffe/device.hpp" +#include +#include +#include +#include +#include + +namespace caffe { +#ifndef CPU_ONLY +string buildOption = "-x clc++ "; +std::string oclKernelPath = "./src/caffe/ocl/"; +Device amdDevice; + +Device::~Device() { + ReleaseKernels(); + free((void*) platformIDs); + free (DeviceIDs); + clReleaseProgram (Program); + clReleaseCommandQueue (CommandQueue); + clReleaseCommandQueue (CommandQueue_helper); + clReleaseContext (Context); + LOG(INFO) << "device destructor"; +} + +cl_int Device::Init(int deviceId) { + + DisplayPlatformInfo(); + + clGetPlatformIDs(0, NULL, &numPlatforms); + cl_platform_id PlatformIDs[numPlatforms]; + clGetPlatformIDs(numPlatforms, PlatformIDs, NULL); + + size_t nameLen; + cl_int res = clGetPlatformInfo(PlatformIDs[0], CL_PLATFORM_NAME, 64, + platformName, &nameLen); + if (res != CL_SUCCESS) { + fprintf(stderr, "Err: Failed to Get Platform Info\n"); + return 0; + } + platformName[nameLen] = 0; + + GetDeviceInfo(); + cl_uint uiNumDevices; + cl_bool unified_memory = false; + clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); + uiNumDevices = numDevices; + if (0 == uiNumDevices) { + LOG(FATAL) << "Err: No GPU devices"; + } else { + pDevices = (cl_device_id *) malloc(uiNumDevices * sizeof(cl_device_id)); + OCL_CHECK( + clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, uiNumDevices, + pDevices, &uiNumDevices)); + if (deviceId == -1) { + int i; + for (i = 0; i < (int) uiNumDevices; i++) { + clGetDeviceInfo(pDevices[i], CL_DEVICE_HOST_UNIFIED_MEMORY, + sizeof(cl_bool), &unified_memory, NULL); + if (!unified_memory) { //skip iGPU + //we pick the first dGPU we found + pDevices[0] = pDevices[i]; + device_id = i; + LOG(INFO) << "Picked default device type : dGPU " << device_id; + break; + } + } + if (i == uiNumDevices) { + LOG(FATAL) << "Cannot find any dGPU! "; + } + } else if (deviceId >= 0 && deviceId < uiNumDevices) { + pDevices[0] = pDevices[deviceId]; + device_id = deviceId; + LOG(INFO) << "Picked device type : GPU " << device_id; + } else { + LOG(FATAL) << " Invalid GPU deviceId! 
"; + } + } + + Context = clCreateContext(NULL, 1, pDevices, NULL, NULL, NULL); + if (NULL == Context) { + fprintf(stderr, "Err: Failed to Create Context\n"); + return 0; + } + CommandQueue = clCreateCommandQueue(Context, pDevices[0], + CL_QUEUE_PROFILING_ENABLE, NULL); + CommandQueue_helper = clCreateCommandQueue(Context, pDevices[0], + CL_QUEUE_PROFILING_ENABLE, NULL); + if (NULL == CommandQueue || NULL == CommandQueue_helper) { + fprintf(stderr, "Err: Failed to Create Commandqueue\n"); + return 0; + } + BuildProgram (oclKernelPath); + row = clblasRowMajor; + col = clblasColumnMajor; + return 0; +} + +void Device::BuildProgram(std::string kernel_dir) { + std::string strSource = ""; + DIR *ocl_dir; + struct dirent *dirp; + if ((ocl_dir = opendir(kernel_dir.c_str())) == NULL) { + fprintf(stderr, "Err: Open ocl dir failed!\n"); + } + while ((dirp = readdir(ocl_dir)) != NULL) { + //Ignore hidden files + if (dirp->d_name[0] == '.') + continue; + std::string file_name = std::string(dirp->d_name); + //Skip non *.cl files + size_t last_dot_pos = file_name.find_last_of("."); + if (file_name.substr(last_dot_pos + 1) != "cl") + continue; + + std::string ocl_kernel_full_path = kernel_dir + file_name; + std::string tmpSource = ""; + ConvertToString(ocl_kernel_full_path.c_str(), tmpSource); + strSource += tmpSource; + } + const char *pSource; + pSource = strSource.c_str(); + size_t uiArrSourceSize[] = { 0 }; + uiArrSourceSize[0] = strlen(pSource); + Program = NULL; + Program = clCreateProgramWithSource(Context, 1, &pSource, uiArrSourceSize, + NULL); + if (NULL == Program) { + fprintf(stderr, "Err: Failed to create program\n"); + } + cl_int iStatus = clBuildProgram(Program, 1, pDevices, buildOption.c_str(), + NULL, NULL); + LOG(INFO) << "Build Program"; + if (CL_SUCCESS != iStatus) { + fprintf(stderr, "Err: Failed to build program\n"); + char szBuildLog[16384]; + clGetProgramBuildInfo(Program, *pDevices, CL_PROGRAM_BUILD_LOG, + sizeof(szBuildLog), szBuildLog, NULL); + std::cout << szBuildLog; + clReleaseProgram (Program); + } +} + +//Use to read OpenCL source code +cl_int Device::ConvertToString(std::string pFileName, std::string &Str) { + size_t uiSize = 0; + size_t uiFileSize = 0; + char *pStr = NULL; + char *tmp = (char*) pFileName.data(); + std::fstream fFile(tmp, (std::fstream::in | std::fstream::binary)); + if (fFile.is_open()) { + fFile.seekg(0, std::fstream::end); + uiSize = uiFileSize = (size_t) fFile.tellg(); + fFile.seekg(0, std::fstream::beg); + pStr = new char[uiSize + 1]; + + if (NULL == pStr) { + fFile.close(); + return 0; + } + fFile.read(pStr, uiFileSize); + fFile.close(); + pStr[uiSize] = '\0'; + Str = pStr; + delete[] pStr; + return 0; + } + LOG(ERROR) << "Err: Failed to open cl file!"; + return -1; +} + +cl_kernel Device::GetKernel(std::string kernel_name) { + std::map::iterator it = Kernels.find(kernel_name); + if (it == Kernels.end()) { + cl_int _err = 0; + cl_kernel kernel = clCreateKernel(Program, kernel_name.c_str(), &_err); + OCL_CHECK(_err); + Kernels[kernel_name] = kernel; + } + return Kernels[kernel_name]; +} + +void Device::ReleaseKernels() { + std::map::iterator it; + for (it = Kernels.begin(); it != Kernels.end(); it++) { + clReleaseKernel(it->second); + } +} + +void Device::DisplayPlatformInfo() { + cl_int err; + + err = clGetPlatformIDs(0, NULL, &numPlatforms); + if (err != CL_SUCCESS || numPlatforms <= 0) { + LOG(ERROR) << "Failed to find any OpenCL platform."; + return; + } + + platformIDs = (cl_platform_id *) malloc( + sizeof(cl_platform_id) * numPlatforms); + err = 
clGetPlatformIDs(numPlatforms, platformIDs, NULL); + if (err != CL_SUCCESS) { + LOG(ERROR) << "Failed to find any OpenCL platform."; + return; + } + + LOG(INFO) << "Number of platforms found:" << numPlatforms; + + //iterate through the list of platforms displaying platform information + for (cl_uint i = 0; i < numPlatforms; i++) { + DisplayInfo(platformIDs[i], CL_PLATFORM_NAME, "CL_PLATFORM_NAME"); + DisplayInfo(platformIDs[i], CL_PLATFORM_PROFILE, "CL_PLATFORM_PROFILE"); + DisplayInfo(platformIDs[i], CL_PLATFORM_VERSION, "CL_PLATFORM_VERSION"); + DisplayInfo(platformIDs[i], CL_PLATFORM_VENDOR, "CL_PLATFORM_VENDOR"); + DisplayInfo(platformIDs[i], CL_PLATFORM_EXTENSIONS, + "CL_PLATFORM_EXTENSIONS"); + } + +} + +void Device::DisplayInfo(cl_platform_id id, cl_platform_info name, + std::string str) { + cl_int err; + std::size_t paramValueSize; + + err = clGetPlatformInfo(id, name, 0, NULL, ¶mValueSize); + if (err != CL_SUCCESS) { + LOG(ERROR) << "Failed to find OpenCL platform:" << str; + return; + } + + char * info = (char *) alloca(sizeof(char) * paramValueSize); + err = clGetPlatformInfo(id, name, paramValueSize, info, NULL); + if (err != CL_SUCCESS) { + LOG(ERROR) << "Failed to find OpenCL platform:" << str; + return; + } + + LOG(INFO) << "\t" << str << "\t" << info; +} + +void Device::GetDeviceInfo() { + cl_int err; + //by default, we select the first platform. can be extended for more platforms + //query GPU device for now + err = clGetDeviceIDs(platformIDs[0], CL_DEVICE_TYPE_GPU, 0, NULL, + &numDevices); + // we allow program run if no GPU is found. Just return. No error reported. + if (numDevices < 1) { + LOG(INFO) << "No GPU Devices found for platform" << platformIDs[0]; + LOG(WARNING) << "No GPU Devices found for platform" << platformIDs[0]; + return; + } + + DeviceIDs = (cl_device_id *) malloc(sizeof(cl_device_id) * numDevices); + err = clGetDeviceIDs(platformIDs[0], CL_DEVICE_TYPE_GPU, numDevices, + DeviceIDs, NULL); + if (err != CL_SUCCESS) { + LOG(INFO) << "Failed to find any GPU devices."; + return; + } + + LOG(INFO) << "Number of devices found:" << numDevices; + for (cl_uint i = 0; i < numDevices; i++) { + LOG(INFO) << "\t" << "DeviceID" << ":\t" << DeviceIDs[i]; + DisplayDeviceInfo < cl_device_type + > (DeviceIDs[i], CL_DEVICE_TYPE, "Device Type"); + DisplayDeviceInfo < cl_bool + > (DeviceIDs[i], CL_DEVICE_HOST_UNIFIED_MEMORY, "Is it integrated GPU?"); + DisplayDeviceInfo < cl_uint + > (DeviceIDs[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, "Max clock frequency MHz"); + DisplayDeviceInfo < cl_bool + > (DeviceIDs[i], CL_DEVICE_HOST_UNIFIED_MEMORY, "Host-Device unified mem"); + DisplayDeviceInfo < cl_bool + > (DeviceIDs[i], CL_DEVICE_ERROR_CORRECTION_SUPPORT, "ECC support"); + DisplayDeviceInfo < cl_bool + > (DeviceIDs[i], CL_DEVICE_ENDIAN_LITTLE, "Endian little"); + DisplayDeviceInfo < cl_uint + > (DeviceIDs[i], CL_DEVICE_MAX_COMPUTE_UNITS, "Max compute units"); + DisplayDeviceInfo < size_t + > (DeviceIDs[i], CL_DEVICE_MAX_WORK_GROUP_SIZE, "Max work group size"); + DisplayDeviceInfo < cl_uint + > (DeviceIDs[i], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, "Max work item dimensions"); + DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_MAX_WORK_ITEM_SIZES, + "Max work item sizes"); + DisplayDeviceInfo < cl_command_queue_properties + > (DeviceIDs[i], CL_DEVICE_QUEUE_PROPERTIES, "CL_DEVICE_QUEUE_PROPERTIES"); + DisplayDeviceInfo < cl_device_exec_capabilities + > (DeviceIDs[i], CL_DEVICE_EXECUTION_CAPABILITIES, "CL_DEVICE_EXECUTION_CAPABILITIES"); + DisplayDeviceInfo < cl_ulong + > (DeviceIDs[i], 
CL_DEVICE_MAX_MEM_ALLOC_SIZE, "Max mem alloc size"); + DisplayDeviceInfo < cl_ulong + > (DeviceIDs[i], CL_DEVICE_GLOBAL_MEM_SIZE, "Global mem size"); + DisplayDeviceInfo < cl_ulong + > (DeviceIDs[i], CL_DEVICE_LOCAL_MEM_SIZE, "Local mem size"); + } + +} + +void Device::DeviceQuery() { + DisplayPlatformInfo(); + + clGetPlatformIDs(0, NULL, &numPlatforms); + cl_platform_id PlatformIDs[numPlatforms]; + clGetPlatformIDs(numPlatforms, PlatformIDs, NULL); + + size_t nameLen; + cl_int res = clGetPlatformInfo(PlatformIDs[0], CL_PLATFORM_NAME, 64, + platformName, &nameLen); + if (res != CL_SUCCESS) { + fprintf(stderr, "Err: Failed to Get Platform Info\n"); + return; + } + platformName[nameLen] = 0; + + GetDeviceInfo(); +} + +template +void Device::DisplayDeviceInfo(cl_device_id id, cl_device_info name, + std::string str) { + cl_int err; + std::size_t paramValueSize; + + err = clGetDeviceInfo(id, name, 0, NULL, ¶mValueSize); + if (err != CL_SUCCESS) { + LOG(ERROR) << "Failed to find OpenCL device info:" << str; + return; + } + + std::string content; + T * info = (T *) alloca(sizeof(T) * paramValueSize); + err = clGetDeviceInfo(id, name, paramValueSize, info, NULL); + if (err != CL_SUCCESS) { + LOG(ERROR) << "Failed to find OpenCL device info:" << str; + return; + } + + switch (name) { + case CL_DEVICE_TYPE: { + std::string deviceType; + appendBitfield < cl_device_type + > (*(reinterpret_cast(info)), CL_DEVICE_TYPE_CPU, "CL_DEVICE_TYPE_CPU", deviceType); + + appendBitfield < cl_device_type + > (*(reinterpret_cast(info)), CL_DEVICE_TYPE_GPU, "CL_DEVICE_TYPE_GPU", deviceType); + + appendBitfield < cl_device_type + > (*(reinterpret_cast(info)), CL_DEVICE_TYPE_ACCELERATOR, "CL_DEVICE_TYPE_ACCELERATOR", deviceType); + + appendBitfield < cl_device_type + > (*(reinterpret_cast(info)), CL_DEVICE_TYPE_DEFAULT, "CL_DEVICE_TYPE_DEFAULT", deviceType); + + LOG(INFO) << "\t " << str << ":\t" << deviceType; + } + break; + case CL_DEVICE_EXECUTION_CAPABILITIES: { + std::string memType; + appendBitfield < cl_device_exec_capabilities + > (*(reinterpret_cast(info)), CL_EXEC_KERNEL, "CL_EXEC_KERNEL", memType); + + appendBitfield < cl_device_exec_capabilities + > (*(reinterpret_cast(info)), CL_EXEC_NATIVE_KERNEL, "CL_EXEC_NATIVE_KERNEL", memType); + + LOG(INFO) << "\t " << str << ":\t" << memType; + + } + break; + case CL_DEVICE_QUEUE_PROPERTIES: { + std::string memType; + appendBitfield < cl_device_exec_capabilities + > (*(reinterpret_cast(info)), CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, "CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE", memType); + + appendBitfield < cl_device_exec_capabilities + > (*(reinterpret_cast(info)), CL_QUEUE_PROFILING_ENABLE, "CL_QUEUE_PROFILING_ENABLE", memType); + + LOG(INFO) << "\t " << str << ":\t" << memType; + } + break; + default: + LOG(INFO) << "\t" << str << ":\t" << *info; + break; + } + +} + +template +void Device::appendBitfield(T info, T value, std::string name, + std::string &str) { + if (info & value) { + if (str.length() > 0) { + str.append(" | "); + } + str.append(name); + } +} + +#endif +} // namespace caffe + diff --git a/src/caffe/internal_thread.cpp b/src/caffe/internal_thread.cpp index c2d19d43..fb512847 100644 --- a/src/caffe/internal_thread.cpp +++ b/src/caffe/internal_thread.cpp @@ -11,7 +11,6 @@ bool InternalThread::is_started() const { return thread_.get() != NULL && thread_->joinable(); } - bool InternalThread::StartInternalThread() { if (!WaitForInternalThreadToExit()) { return false; diff --git a/src/caffe/layer_factory.cpp b/src/caffe/layer_factory.cpp index 
926c7d8f..44233c98 100644 --- a/src/caffe/layer_factory.cpp +++ b/src/caffe/layer_factory.cpp @@ -18,8 +18,7 @@ namespace caffe { // Get convolution layer according to engine. template -shared_ptr > GetConvolutionLayer( - const LayerParameter& param) { +shared_ptr > GetConvolutionLayer(const LayerParameter& param) { ConvolutionParameter_Engine engine = param.convolution_param().engine(); if (engine == ConvolutionParameter_Engine_DEFAULT) { engine = ConvolutionParameter_Engine_CAFFE; @@ -28,7 +27,7 @@ shared_ptr > GetConvolutionLayer( #endif } if (engine == ConvolutionParameter_Engine_CAFFE) { - return shared_ptr >(new ConvolutionLayer(param)); + return shared_ptr < Layer > (new ConvolutionLayer(param)); #ifdef USE_CUDNN } else if (engine == ConvolutionParameter_Engine_CUDNN) { return shared_ptr >(new CuDNNConvolutionLayer(param)); @@ -51,14 +50,14 @@ shared_ptr > GetPoolingLayer(const LayerParameter& param) { #endif } if (engine == PoolingParameter_Engine_CAFFE) { - return shared_ptr >(new PoolingLayer(param)); + return shared_ptr < Layer > (new PoolingLayer(param)); #ifdef USE_CUDNN } else if (engine == PoolingParameter_Engine_CUDNN) { PoolingParameter p_param = param.pooling_param(); if (p_param.pad() || p_param.pad_h() || p_param.pad_w() || param.top_size() > 1) { LOG(INFO) << "CUDNN does not support padding or multiple tops. " - << "Using Caffe's own pooling layer."; + << "Using Caffe's own pooling layer."; return shared_ptr >(new PoolingLayer(param)); } return shared_ptr >(new CuDNNPoolingLayer(param)); @@ -81,7 +80,7 @@ shared_ptr > GetReLULayer(const LayerParameter& param) { #endif } if (engine == ReLUParameter_Engine_CAFFE) { - return shared_ptr >(new ReLULayer(param)); + return shared_ptr < Layer > (new ReLULayer(param)); #ifdef USE_CUDNN } else if (engine == ReLUParameter_Engine_CUDNN) { return shared_ptr >(new CuDNNReLULayer(param)); @@ -104,7 +103,7 @@ shared_ptr > GetSigmoidLayer(const LayerParameter& param) { #endif } if (engine == SigmoidParameter_Engine_CAFFE) { - return shared_ptr >(new SigmoidLayer(param)); + return shared_ptr < Layer > (new SigmoidLayer(param)); #ifdef USE_CUDNN } else if (engine == SigmoidParameter_Engine_CUDNN) { return shared_ptr >(new CuDNNSigmoidLayer(param)); @@ -127,7 +126,7 @@ shared_ptr > GetSoftmaxLayer(const LayerParameter& param) { #endif } if (engine == SoftmaxParameter_Engine_CAFFE) { - return shared_ptr >(new SoftmaxLayer(param)); + return shared_ptr < Layer > (new SoftmaxLayer(param)); #ifdef USE_CUDNN } else if (engine == SoftmaxParameter_Engine_CUDNN) { return shared_ptr >(new CuDNNSoftmaxLayer(param)); @@ -150,7 +149,7 @@ shared_ptr > GetTanHLayer(const LayerParameter& param) { #endif } if (engine == TanHParameter_Engine_CAFFE) { - return shared_ptr >(new TanHLayer(param)); + return shared_ptr < Layer > (new TanHLayer(param)); #ifdef USE_CUDNN } else if (engine == TanHParameter_Engine_CUDNN) { return shared_ptr >(new CuDNNTanHLayer(param)); @@ -181,4 +180,5 @@ REGISTER_LAYER_CREATOR(Python, GetPythonLayer); // Layers that use their constructor as their default creator should be // registered in their corresponding cpp files. Do not register them here. 
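// Usage sketch for the factory above (illustrative only; it assumes the stock
// LayerRegistry / LayerParameter API that this file already uses, and the
// parameter values are made up):
//
//   LayerParameter param;
//   param.set_type("Convolution");
//   param.mutable_convolution_param()->set_num_output(96);
//   // With no engine set, GetConvolutionLayer() falls back to
//   // ConvolutionParameter_Engine_CAFFE, i.e. the OpenCL-backed
//   // ConvolutionLayer in this port.
//   shared_ptr<Layer<float> > layer =
//       LayerRegistry<float>::CreateLayer(param);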
-} // namespace caffe +} + // namespace caffe diff --git a/src/caffe/layers/absval_layer.cpp b/src/caffe/layers/absval_layer.cpp index 5ce28c9e..20898f15 100644 --- a/src/caffe/layers/absval_layer.cpp +++ b/src/caffe/layers/absval_layer.cpp @@ -8,15 +8,15 @@ namespace caffe { template void AbsValLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - NeuronLayer::LayerSetUp(bottom, top); + const vector*>& top) { + NeuronLayer < Dtype > ::LayerSetUp(bottom, top); CHECK_NE(top[0], bottom[0]) << this->type() << " Layer does not " - "allow in-place computation."; + "allow in-place computation."; } template -void AbsValLayer::Forward_cpu( - const vector*>& bottom, const vector*>& top) { +void AbsValLayer::Forward_cpu(const vector*>& bottom, + const vector*>& top) { const int count = top[0]->count(); Dtype* top_data = top[0]->mutable_cpu_data(); caffe_abs(count, bottom[0]->cpu_data(), top_data); @@ -35,11 +35,35 @@ void AbsValLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY +#ifndef CPU_ONLY +// begin: code modified for OpenCL port +template +void AbsValLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + const int count = top[0]->count(); + Dtype* top_data = top[0]->mutable_gpu_data(); + caffe_gpu_abs(count, bottom[0]->gpu_data(), top_data); +} + +template +void AbsValLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + const int count = top[0]->count(); + const Dtype* top_diff = top[0]->gpu_diff(); + if (propagate_down[0]) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + caffe_gpu_sign(count, bottom_data, bottom_diff); + caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff); + } +} +// end: code modified for OpenCL port + +#else STUB_GPU(AbsValLayer); #endif -INSTANTIATE_CLASS(AbsValLayer); -REGISTER_LAYER_CLASS(AbsVal); +INSTANTIATE_CLASS (AbsValLayer); +REGISTER_LAYER_CLASS (AbsVal); } // namespace caffe diff --git a/src/caffe/layers/absval_layer.cu b/src/caffe/layers/absval_layer.cu deleted file mode 100644 index bb310e1a..00000000 --- a/src/caffe/layers/absval_layer.cu +++ /dev/null @@ -1,33 +0,0 @@ -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void AbsValLayer::Forward_gpu( - const vector*>& bottom, const vector*>& top) { - const int count = top[0]->count(); - Dtype* top_data = top[0]->mutable_gpu_data(); - caffe_gpu_abs(count, bottom[0]->gpu_data(), top_data); -} - -template -void AbsValLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const int count = top[0]->count(); - const Dtype* top_diff = top[0]->gpu_diff(); - if (propagate_down[0]) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - caffe_gpu_sign(count, bottom_data, bottom_diff); - caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff); - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(AbsValLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/accuracy_layer.cpp b/src/caffe/layers/accuracy_layer.cpp index 90aad675..4cfc96f8 100644 --- a/src/caffe/layers/accuracy_layer.cpp +++ b/src/caffe/layers/accuracy_layer.cpp @@ -11,24 +11,23 @@ namespace caffe { template -void AccuracyLayer::LayerSetUp( - const vector*>& bottom, const vector*>& top) { +void AccuracyLayer::LayerSetUp(const vector*>& bottom, + const vector*>& top) { top_k_ = 
this->layer_param_.accuracy_param().top_k(); - has_ignore_label_ = - this->layer_param_.accuracy_param().has_ignore_label(); + has_ignore_label_ = this->layer_param_.accuracy_param().has_ignore_label(); if (has_ignore_label_) { ignore_label_ = this->layer_param_.accuracy_param().ignore_label(); } } template -void AccuracyLayer::Reshape( - const vector*>& bottom, const vector*>& top) { +void AccuracyLayer::Reshape(const vector*>& bottom, + const vector*>& top) { CHECK_LE(top_k_, bottom[0]->count() / bottom[1]->count()) << "top_k must be less than or equal to the number of classes."; - label_axis_ = - bottom[0]->CanonicalAxisIndex(this->layer_param_.accuracy_param().axis()); + label_axis_ = bottom[0]->CanonicalAxisIndex( + this->layer_param_.accuracy_param().axis()); outer_num_ = bottom[0]->count(0, label_axis_); inner_num_ = bottom[0]->count(label_axis_ + 1); CHECK_EQ(outer_num_ * inner_num_, bottom[1]->count()) @@ -48,27 +47,26 @@ void AccuracyLayer::Forward_cpu(const vector*>& bottom, const Dtype* bottom_label = bottom[1]->cpu_data(); const int dim = bottom[0]->count() / outer_num_; const int num_labels = bottom[0]->shape(label_axis_); - vector maxval(top_k_+1); - vector max_id(top_k_+1); + vector < Dtype > maxval(top_k_ + 1); + vector max_id(top_k_ + 1); int count = 0; for (int i = 0; i < outer_num_; ++i) { for (int j = 0; j < inner_num_; ++j) { - const int label_value = - static_cast(bottom_label[i * inner_num_ + j]); + const int label_value = static_cast(bottom_label[i * inner_num_ + j]); if (has_ignore_label_ && label_value == ignore_label_) { continue; } DCHECK_GE(label_value, 0); DCHECK_LT(label_value, num_labels); // Top-k accuracy - std::vector > bottom_data_vector; + std::vector < std::pair > bottom_data_vector; for (int k = 0; k < num_labels; ++k) { - bottom_data_vector.push_back(std::make_pair( - bottom_data[i * dim + k * inner_num_ + j], k)); + bottom_data_vector.push_back( + std::make_pair(bottom_data[i * dim + k * inner_num_ + j], k)); } - std::partial_sort( - bottom_data_vector.begin(), bottom_data_vector.begin() + top_k_, - bottom_data_vector.end(), std::greater >()); + std::partial_sort(bottom_data_vector.begin(), + bottom_data_vector.begin() + top_k_, bottom_data_vector.end(), + std::greater >()); // check if true label is in top k predictions for (int k = 0; k < top_k_; k++) { if (bottom_data_vector[k].second == label_value) { @@ -85,7 +83,7 @@ void AccuracyLayer::Forward_cpu(const vector*>& bottom, // Accuracy layer should not be used as a loss function. 
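// Worked example of the top-k test above, using made-up scores for a single
// spatial position (not data from this patch): with class scores
// {0.1, 0.5, 0.3, 0.1}, top_k_ == 2 and ground-truth label 2, the
// partial_sort above brings (0.5, 1) and (0.3, 2) to the front, so the
// sample counts as correct; with top_k_ == 1 it would not.
//
//   std::vector<std::pair<float, int> > v;  // (score, class index)
//   float scores[4] = {0.1f, 0.5f, 0.3f, 0.1f};
//   for (int k = 0; k < 4; ++k) v.push_back(std::make_pair(scores[k], k));
//   std::partial_sort(v.begin(), v.begin() + 2, v.end(),
//                     std::greater<std::pair<float, int> >());
//   // v[0].second == 1 and v[1].second == 2, so label 2 is in the top 2.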
} -INSTANTIATE_CLASS(AccuracyLayer); -REGISTER_LAYER_CLASS(Accuracy); +INSTANTIATE_CLASS (AccuracyLayer); +REGISTER_LAYER_CLASS (Accuracy); } // namespace caffe diff --git a/src/caffe/layers/argmax_layer.cpp b/src/caffe/layers/argmax_layer.cpp index c4040cdc..7b37283d 100644 --- a/src/caffe/layers/argmax_layer.cpp +++ b/src/caffe/layers/argmax_layer.cpp @@ -10,7 +10,7 @@ namespace caffe { template void ArgMaxLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { out_max_val_ = this->layer_param_.argmax_param().out_max_val(); top_k_ = this->layer_param_.argmax_param().top_k(); CHECK_GE(top_k_, 1) << " top k must not be less than 1."; @@ -20,7 +20,7 @@ void ArgMaxLayer::LayerSetUp(const vector*>& bottom, template void ArgMaxLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { if (out_max_val_) { // Produces max_ind and max_val top[0]->Reshape(bottom[0]->num(), 2, top_k_, 1); @@ -38,14 +38,13 @@ void ArgMaxLayer::Forward_cpu(const vector*>& bottom, int num = bottom[0]->num(); int dim = bottom[0]->count() / bottom[0]->num(); for (int i = 0; i < num; ++i) { - std::vector > bottom_data_vector; + std::vector < std::pair > bottom_data_vector; for (int j = 0; j < dim; ++j) { - bottom_data_vector.push_back( - std::make_pair(bottom_data[i * dim + j], j)); + bottom_data_vector.push_back(std::make_pair(bottom_data[i * dim + j], j)); } - std::partial_sort( - bottom_data_vector.begin(), bottom_data_vector.begin() + top_k_, - bottom_data_vector.end(), std::greater >()); + std::partial_sort(bottom_data_vector.begin(), + bottom_data_vector.begin() + top_k_, bottom_data_vector.end(), + std::greater >()); for (int j = 0; j < top_k_; ++j) { top_data[top[0]->offset(i, 0, j)] = bottom_data_vector[j].second; } @@ -57,7 +56,7 @@ void ArgMaxLayer::Forward_cpu(const vector*>& bottom, } } -INSTANTIATE_CLASS(ArgMaxLayer); -REGISTER_LAYER_CLASS(ArgMax); +INSTANTIATE_CLASS (ArgMaxLayer); +REGISTER_LAYER_CLASS (ArgMax); } // namespace caffe diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp index ccb3adc7..5d99e04d 100644 --- a/src/caffe/layers/base_conv_layer.cpp +++ b/src/caffe/layers/base_conv_layer.cpp @@ -5,29 +5,77 @@ #include "caffe/util/im2col.hpp" #include "caffe/util/math_functions.hpp" #include "caffe/vision_layers.hpp" +#include "caffe/common.hpp" namespace caffe { +#ifndef CPU_ONLY +#ifdef use_packing_scheme +template size_t BaseConvolutionLayer::subtop_mem_size = sizeof(Dtype); +template size_t BaseConvolutionLayer::trans_mem_size = sizeof(Dtype); +template cl_mem BaseConvolutionLayer::subTopMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, BaseConvolutionLayer::subtop_mem_size, NULL, NULL); +template cl_mem BaseConvolutionLayer::transMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, BaseConvolutionLayer::trans_mem_size, NULL, NULL); +#endif + +template +void Alloc_public_tmp_mem(size_t subtop_size, size_t trans_size) { + if (subtop_size > BaseConvolutionLayer < Dtype > ::subtop_mem_size) { + ConvolutionLayer < Dtype > ::subtop_mem_size = subtop_size; + clReleaseMemObject(ConvolutionLayer < Dtype > ::subTopMem); + ConvolutionLayer < Dtype > ::subTopMem = clCreateBuffer(amdDevice.Context, + CL_MEM_READ_WRITE, BaseConvolutionLayer < Dtype > ::subtop_mem_size, + NULL, NULL); + } + if (trans_size > ConvolutionLayer < Dtype > ::trans_mem_size) { + ConvolutionLayer < Dtype > ::trans_mem_size = trans_size; + clReleaseMemObject(ConvolutionLayer < Dtype > ::transMem); + ConvolutionLayer < 
Dtype > ::transMem = clCreateBuffer(amdDevice.Context, + CL_MEM_READ_WRITE, BaseConvolutionLayer < Dtype > ::trans_mem_size, + NULL, NULL); + } +} + +template +void BaseConvolutionLayer::ocl_setup() { + M_ = num_output_ / group_; + K_ = conv_in_channels_ * kernel_w_ * kernel_h_ / group_; + N_ = height_out_ * width_out_; +#ifdef use_packing_scheme + size_t subtop_size = (size_t)((M_ * group_) * N_ * global_packing_N * sizeof(Dtype)); + size_t trans_size = (size_t)((K_ * group_ )* N_ * global_packing_N * sizeof(Dtype)); + Alloc_public_tmp_mem(subtop_size, trans_size); +#endif +} + +#endif + +template +BaseConvolutionLayer::~BaseConvolutionLayer() { +} + template void BaseConvolutionLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " << "corresponding to (num, channels, height, width)"; // Configure the kernel size, padding, stride, and inputs. ConvolutionParameter conv_param = this->layer_param_.convolution_param(); - CHECK(!conv_param.has_kernel_size() != - !(conv_param.has_kernel_h() && conv_param.has_kernel_w())) + CHECK( + !conv_param.has_kernel_size() + != !(conv_param.has_kernel_h() && conv_param.has_kernel_w())) << "Filter size is kernel_size OR kernel_h and kernel_w; not both"; - CHECK(conv_param.has_kernel_size() || - (conv_param.has_kernel_h() && conv_param.has_kernel_w())) + CHECK( + conv_param.has_kernel_size() + || (conv_param.has_kernel_h() && conv_param.has_kernel_w())) << "For non-square filters both kernel_h and kernel_w are required."; - CHECK((!conv_param.has_pad() && conv_param.has_pad_h() - && conv_param.has_pad_w()) - || (!conv_param.has_pad_h() && !conv_param.has_pad_w())) + CHECK( + (!conv_param.has_pad() && conv_param.has_pad_h() && conv_param.has_pad_w()) + || (!conv_param.has_pad_h() && !conv_param.has_pad_w())) << "pad is pad OR pad_h and pad_w are required."; - CHECK((!conv_param.has_stride() && conv_param.has_stride_h() - && conv_param.has_stride_w()) - || (!conv_param.has_stride_h() && !conv_param.has_stride_w())) + CHECK( + (!conv_param.has_stride() && conv_param.has_stride_h() + && conv_param.has_stride_w()) + || (!conv_param.has_stride_h() && !conv_param.has_stride_w())) << "Stride is stride OR stride_h and stride_w are required."; if (conv_param.has_kernel_size()) { kernel_h_ = kernel_w_ = conv_param.kernel_size(); @@ -51,8 +99,8 @@ void BaseConvolutionLayer::LayerSetUp(const vector*>& bottom, } // Special case: im2col is the identity for 1x1 convolution with stride 1 // and no padding, so flag for skipping the buffer and transformation. - is_1x1_ = kernel_w_ == 1 && kernel_h_ == 1 - && stride_h_ == 1 && stride_w_ == 1 && pad_h_ == 0 && pad_w_ == 0; + is_1x1_ = kernel_w_ == 1 && kernel_h_ == 1 && stride_h_ == 1 && stride_w_ == 1 + && pad_h_ == 0 && pad_w_ == 0; // Configure output channels and groups. channels_ = bottom[0]->channels(); num_output_ = this->layer_param_.convolution_param().num_output(); @@ -68,6 +116,7 @@ void BaseConvolutionLayer::LayerSetUp(const vector*>& bottom, conv_out_channels_ = num_output_; conv_in_channels_ = channels_; } + // Handle the parameters: weights and biases. 
// - blobs_[0] holds the filter weights // - blobs_[1] holds the biases (optional) @@ -82,17 +131,22 @@ void BaseConvolutionLayer::LayerSetUp(const vector*>& bottom, } // Initialize and fill the weights: // output channels x input channels per-group x kernel height x kernel width - this->blobs_[0].reset(new Blob( - conv_out_channels_, conv_in_channels_ / group_, kernel_h_, kernel_w_)); - shared_ptr > weight_filler(GetFiller( - this->layer_param_.convolution_param().weight_filler())); + this->blobs_[0].reset( + new Blob(conv_out_channels_, conv_in_channels_ / group_, + kernel_h_, kernel_w_)); + shared_ptr < Filler + > weight_filler( + GetFiller < Dtype + > (this->layer_param_.convolution_param().weight_filler())); weight_filler->Fill(this->blobs_[0].get()); // If necessary, initialize and fill the biases. if (bias_term_) { vector bias_shape(1, num_output_); this->blobs_[1].reset(new Blob(bias_shape)); - shared_ptr > bias_filler(GetFiller( - this->layer_param_.convolution_param().bias_filler())); + shared_ptr < Filler + > bias_filler( + GetFiller < Dtype + > (this->layer_param_.convolution_param().bias_filler())); bias_filler->Fill(this->blobs_[1].get()); } } @@ -102,14 +156,14 @@ void BaseConvolutionLayer::LayerSetUp(const vector*>& bottom, template void BaseConvolutionLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " << "corresponding to (num, channels, height, width)"; num_ = bottom[0]->num(); height_ = bottom[0]->height(); width_ = bottom[0]->width(); CHECK_EQ(bottom[0]->channels(), channels_) << "Input size incompatible with" - " convolution kernel."; + " convolution kernel."; // TODO: generalize to handle inputs of different shapes. for (int bottom_id = 1; bottom_id < bottom.size(); ++bottom_id) { CHECK_EQ(num_, bottom[bottom_id]->num()) << "Inputs must have same num."; @@ -153,6 +207,10 @@ void BaseConvolutionLayer::Reshape(const vector*>& bottom, caffe_set(bias_multiplier_.count(), Dtype(1), bias_multiplier_.mutable_cpu_data()); } +#ifndef CPU_ONLY + //initializa OpenCL kernels and cl_mem objects + ocl_setup(); +#endif } template @@ -166,19 +224,17 @@ void BaseConvolutionLayer::forward_cpu_gemm(const Dtype* input, col_buff = col_buffer_.cpu_data(); } for (int g = 0; g < group_; ++g) { - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, conv_out_channels_ / - group_, conv_out_spatial_dim_, kernel_dim_ / group_, - (Dtype)1., weights + weight_offset_ * g, col_buff + col_offset_ * g, - (Dtype)0., output + output_offset_ * g); + caffe_cpu_gemm (CblasNoTrans, CblasNoTrans, conv_out_channels_ / group_, conv_out_spatial_dim_, kernel_dim_ + / group_, (Dtype) 1., weights + weight_offset_ * g, col_buff + + col_offset_ * g, (Dtype) 0., output + output_offset_ * g); } } template void BaseConvolutionLayer::forward_cpu_bias(Dtype* output, const Dtype* bias) { - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, num_output_, - height_out_ * width_out_, 1, (Dtype)1., bias, bias_multiplier_.cpu_data(), - (Dtype)1., output); + caffe_cpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, num_output_, height_out_ * width_out_, 1, (Dtype) 1., bias, bias_multiplier_.cpu_data(), (Dtype) 1., output); } template @@ -189,10 +245,10 @@ void BaseConvolutionLayer::backward_cpu_gemm(const Dtype* output, col_buff = input; } for (int g = 0; g < group_; ++g) { - caffe_cpu_gemm(CblasTrans, CblasNoTrans, kernel_dim_ / group_, - conv_out_spatial_dim_, conv_out_channels_ / group_, - (Dtype)1., weights + weight_offset_ * g, output + 
output_offset_ * g, - (Dtype)0., col_buff + col_offset_ * g); + caffe_cpu_gemm < Dtype + > (CblasTrans, CblasNoTrans, kernel_dim_ / group_, conv_out_spatial_dim_, conv_out_channels_ + / group_, (Dtype) 1., weights + weight_offset_ * g, output + + output_offset_ * g, (Dtype) 0., col_buff + col_offset_ * g); } if (!is_1x1_) { conv_col2im_cpu(col_buff, input); @@ -208,18 +264,19 @@ void BaseConvolutionLayer::weight_cpu_gemm(const Dtype* input, col_buff = col_buffer_.cpu_data(); } for (int g = 0; g < group_; ++g) { - caffe_cpu_gemm(CblasNoTrans, CblasTrans, conv_out_channels_ / group_, - kernel_dim_ / group_, conv_out_spatial_dim_, - (Dtype)1., output + output_offset_ * g, col_buff + col_offset_ * g, - (Dtype)1., weights + weight_offset_ * g); + caffe_cpu_gemm < Dtype + > (CblasNoTrans, CblasTrans, conv_out_channels_ / group_, kernel_dim_ + / group_, conv_out_spatial_dim_, (Dtype) 1., output + + output_offset_ * g, col_buff + col_offset_ * g, (Dtype) 1., weights + + weight_offset_ * g); } } template void BaseConvolutionLayer::backward_cpu_bias(Dtype* bias, const Dtype* input) { - caffe_cpu_gemv(CblasNoTrans, num_output_, height_out_ * width_out_, 1., - input, bias_multiplier_.cpu_data(), 1., bias); + caffe_cpu_gemv < Dtype + > (CblasNoTrans, num_output_, height_out_ * width_out_, 1., input, bias_multiplier_.cpu_data(), 1., bias); } #ifndef CPU_ONLY @@ -233,21 +290,21 @@ void BaseConvolutionLayer::forward_gpu_gemm(const Dtype* input, conv_im2col_gpu(input, col_buffer_.mutable_gpu_data()); } col_buff = col_buffer_.gpu_data(); - } + } for (int g = 0; g < group_; ++g) { - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, conv_out_channels_ / - group_, conv_out_spatial_dim_, kernel_dim_ / group_, - (Dtype)1., weights + weight_offset_ * g, col_buff + col_offset_ * g, - (Dtype)0., output + output_offset_ * g); + caffe_gpu_gemm < Dtype > (&(amdDevice.CommandQueue), CblasNoTrans, CblasNoTrans, conv_out_channels_ + / group_, conv_out_spatial_dim_, kernel_dim_ / group_, (Dtype) 1., weights, weight_offset_ + * g, col_buff, is_1x1_ * bottom_offset_ + col_offset_ * g, (Dtype) 0., output, top_offset_ + + output_offset_ * g); } + } template void BaseConvolutionLayer::forward_gpu_bias(Dtype* output, const Dtype* bias) { - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num_output_, - height_out_ * width_out_, 1, (Dtype)1., bias, bias_multiplier_.gpu_data(), - (Dtype)1., output); + caffe_gpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, num_output_, height_out_ * width_out_, 1, (Dtype) 1., bias, 0, reinterpret_cast(bias_multiplier_.gpu_data()), 0, (Dtype) 1., output, top_offset_); } template @@ -257,12 +314,14 @@ void BaseConvolutionLayer::backward_gpu_gemm(const Dtype* output, if (is_1x1_) { col_buff = input; } + for (int g = 0; g < group_; ++g) { - caffe_gpu_gemm(CblasTrans, CblasNoTrans, kernel_dim_ / group_, - conv_out_spatial_dim_, conv_out_channels_ / group_, - (Dtype)1., weights + weight_offset_ * g, output + output_offset_ * g, - (Dtype)0., col_buff + col_offset_ * g); + caffe_gpu_gemm < Dtype> (&(amdDevice.CommandQueue), CblasTrans, CblasNoTrans, kernel_dim_ + / group_, conv_out_spatial_dim_, conv_out_channels_ / group_, (Dtype) 1., weights, weight_offset_ + * g, output, top_offset_ + output_offset_ * g, (Dtype) 0., col_buff, is_1x1_ * bottom_offset_ + col_offset_ + * g); } + if (!is_1x1_) { conv_col2im_gpu(col_buff, input); } @@ -277,22 +336,118 @@ void BaseConvolutionLayer::weight_gpu_gemm(const Dtype* input, col_buff = col_buffer_.gpu_data(); } for (int g = 0; g < group_; ++g) { - caffe_gpu_gemm(CblasNoTrans, 
CblasTrans, conv_out_channels_ / group_, - kernel_dim_ / group_, conv_out_spatial_dim_, - (Dtype)1., output + output_offset_ * g, col_buff + col_offset_ * g, - (Dtype)1., weights + weight_offset_ * g); + caffe_gpu_gemm < Dtype + > (&(amdDevice.CommandQueue), CblasNoTrans, CblasTrans, conv_out_channels_ + / group_, kernel_dim_ / group_, conv_out_spatial_dim_, (Dtype) 1., output, top_offset_ + output_offset_*g, (Dtype*) col_buff, is_1x1_*bottom_offset_ + col_offset_ * g, (Dtype) 1., (Dtype*) weights, weight_offset_ * g); } } template void BaseConvolutionLayer::backward_gpu_bias(Dtype* bias, const Dtype* input) { - caffe_gpu_gemv(CblasNoTrans, num_output_, height_out_ * width_out_, 1., - input, bias_multiplier_.gpu_data(), 1., bias); + caffe_gpu_gemv < Dtype + > (CblasNoTrans, num_output_, N_, (Dtype) 1., input, top_offset_, N_, reinterpret_cast(bias_multiplier_.gpu_data()), (size_t) 0, (Dtype) 1., 1, bias, (size_t) 0, 1); +} + +// begin: code modified for OpenCL port +template +void BaseConvolutionLayer::forward_gpu_gemm_opt(const Dtype* input, + const Dtype* weight, Dtype* output, bool skip_im2col) { + cl_command_queue Queue; + if (!skip_im2col) { + conv_im2col_gpu_opt(input); + } +#ifdef multiQ + for (int g = 0; g < group_; ++g) { + if(g == 0) Queue = amdDevice.CommandQueue; + else Queue = amdDevice.CommandQueue_helper; + caffe_gpu_gemm(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_, + (Dtype)1., weight, weight_offset_ * g, (Dtype*)transMem, col_offset_ * g, + (Dtype)0., (Dtype*)subTopMem, top_offset_opt * g); + } + if(group_ == 2) { + clFinish(amdDevice.CommandQueue); + clFinish(amdDevice.CommandQueue_helper); + } +#else + Queue = amdDevice.CommandQueue; + for (int g = 0; g < group_; ++g) { + caffe_gpu_gemm < Dtype + > (&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_, (Dtype) 1., weight, weight_offset_ + * g, (Dtype*) transMem, col_offset_ * g, (Dtype) 0., (Dtype*) subTopMem, top_offset_opt + * g); + } +#endif + transform_gpu((Dtype*) subTopMem, output, top_offset_, N_, M_ * group_, + opt_num2); +} + +template +void BaseConvolutionLayer::forward_gpu_bias_opt(Dtype* output, + const Dtype* bias) { + for (int z = 0; z < opt_num2; z++) + caffe_gpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, num_output_, N_, 1, (Dtype) 1., bias, 0, reinterpret_cast(bias_multiplier_.gpu_data()), 0, (Dtype) 1., output, top_offset_ + + num_output_ * N_ * z); +} + +template +void BaseConvolutionLayer::backward_gpu_gemm_opt(const Dtype* output, + const Dtype* weights, Dtype* input) { + cl_command_queue Queue; + for (int g = 0; g < group_; ++g) { +#ifdef multiQ + if(g == 0) Queue = amdDevice.CommandQueue; + else Queue = amdDevice.CommandQueue_helper; +#else + Queue = amdDevice.CommandQueue; +#endif + caffe_gpu_gemm < Dtype + > (&(Queue), CblasTrans, CblasNoTrans, K_, N_ * opt_num2, M_, (Dtype) 1., weights, weight_offset_ + * g, (Dtype*) subTopMem, top_offset_opt * g, (Dtype) 0., (Dtype*) transMem, col_offset_ + * g); + } +#ifdef multiQ + if(group_ ==2) { + clFinish(amdDevice.CommandQueue); + clFinish(amdDevice.CommandQueue_helper); + } +#endif + + conv_col2im_gpu_opt(input); +} + +template +void BaseConvolutionLayer::weight_gpu_gemm_opt(const Dtype* input, + const Dtype* output, Dtype* weights) { + cl_command_queue Queue; + conv_im2col_gpu_opt(input); + opttrans(output, top_offset_, 1, M_ * group_, N_, (Dtype*) subTopMem, 0, + opt_num2); + + for (int g = 0; g < group_; ++g) { +#ifdef multiQ + if(g == 0) Queue = amdDevice.CommandQueue; + else Queue = amdDevice.CommandQueue_helper; +#else + 
Queue = amdDevice.CommandQueue; +#endif + caffe_gpu_gemm < Dtype + > (&(Queue), CblasNoTrans, CblasTrans, M_, K_, N_ * opt_num2, (Dtype) 1., (Dtype*) subTopMem, top_offset_opt + * g, (Dtype*) transMem, col_offset_ * g, (Dtype) 1., (Dtype*) weights, weight_offset_ + * g); +#ifdef multiQ + if(group_ == 2) { + clFinish(amdDevice.CommandQueue); + clFinish(amdDevice.CommandQueue_helper); + } +#endif + } } +// end: code is modified for OpenCL #endif // !CPU_ONLY -INSTANTIATE_CLASS(BaseConvolutionLayer); +INSTANTIATE_CLASS (BaseConvolutionLayer); } // namespace caffe diff --git a/src/caffe/layers/base_data_layer.cpp b/src/caffe/layers/base_data_layer.cpp index 26a11182..ff4436a7 100644 --- a/src/caffe/layers/base_data_layer.cpp +++ b/src/caffe/layers/base_data_layer.cpp @@ -3,18 +3,18 @@ #include "caffe/data_layers.hpp" #include "caffe/util/io.hpp" +#include "caffe/util/benchmark.hpp" namespace caffe { template BaseDataLayer::BaseDataLayer(const LayerParameter& param) - : Layer(param), - transform_param_(param.transform_param()) { + : Layer(param), transform_param_(param.transform_param()) { } template void BaseDataLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { if (top.size() == 1) { output_labels_ = false; } else { @@ -30,7 +30,7 @@ void BaseDataLayer::LayerSetUp(const vector*>& bottom, template void BasePrefetchingDataLayer::LayerSetUp( const vector*>& bottom, const vector*>& top) { - BaseDataLayer::LayerSetUp(bottom, top); + BaseDataLayer < Dtype > ::LayerSetUp(bottom, top); // Now, start the prefetch thread. Before calling prefetch, we make two // cpu_data calls so that the prefetch thread does not accidentally make // simultaneous cudaMalloc calls when the main thread is running. In some @@ -60,30 +60,62 @@ void BasePrefetchingDataLayer::Forward_cpu( const vector*>& bottom, const vector*>& top) { // First, join the thread JoinPrefetchThread(); + DLOG(INFO) << "Thread joined"; // Reshape to loaded data. top[0]->ReshapeLike(prefetch_data_); // Copy the data caffe_copy(prefetch_data_.count(), prefetch_data_.cpu_data(), - top[0]->mutable_cpu_data()); + top[0]->mutable_cpu_data()); DLOG(INFO) << "Prefetch copied"; if (this->output_labels_) { // Reshape to loaded labels. top[1]->ReshapeLike(prefetch_label_); // Copy the labels. caffe_copy(prefetch_label_.count(), prefetch_label_.cpu_data(), - top[1]->mutable_cpu_data()); + top[1]->mutable_cpu_data()); } // Start a new prefetch thread DLOG(INFO) << "CreatePrefetchThread"; CreatePrefetchThread(); } -#ifdef CPU_ONLY +#ifndef CPU_ONLY + +template +void BasePrefetchingDataLayer::Forward_gpu( + const vector*>& bottom, const vector*>& top) { + + JoinPrefetchThread(); + DLOG(INFO) << "Thread joined"; + + top[0]->ReshapeLike(this->prefetch_data_); + OCL_CHECK( + clEnqueueWriteBuffer(amdDevice.CommandQueue, + (cl_mem) top[0]->mutable_gpu_data(), CL_TRUE, 0, + sizeof(Dtype) * prefetch_data_.count(), prefetch_data_.cpu_data(), 0, + NULL, NULL)); + DLOG(INFO) << "Prefetch copied"; + if (this->output_labels_) { + // Reshape to loaded labels. 
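// Illustrative sketch of the copy pattern used in this Forward_gpu (editor's
// example, not part of the patch): OpenCL device memory is an opaque cl_mem
// handle rather than a raw pointer, so the prefetched host buffer is pushed
// with a blocking clEnqueueWriteBuffer instead of caffe_copy through
// mutable_gpu_data(). `queue` stands in for amdDevice.CommandQueue, and error
// handling is reduced to a return-code check rather than the port's OCL_CHECK.
#include <CL/cl.h>
#include <cstddef>

template <typename Dtype>
bool blocking_host_to_device(cl_command_queue queue, cl_mem dst,
                             const Dtype* src, size_t count) {
  // CL_TRUE makes the call synchronous: it returns only after `src` has been
  // read, so the prefetch thread may safely refill the host buffer afterwards.
  cl_int err = clEnqueueWriteBuffer(queue, dst, CL_TRUE, /*offset=*/0,
                                    count * sizeof(Dtype), src,
                                    0, NULL, NULL);
  return err == CL_SUCCESS;
}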
+ top[1]->ReshapeLike(prefetch_label_); + OCL_CHECK( + clEnqueueWriteBuffer(amdDevice.CommandQueue, + (cl_mem) top[1]->mutable_gpu_data(), CL_TRUE, 0, + sizeof(Dtype) * prefetch_label_.count(), prefetch_label_.cpu_data(), + 0, NULL, NULL)); + } + + // Start a new prefetch thread + DLOG(INFO) << "CreatePrefetchThread"; + CreatePrefetchThread(); +} + +#else STUB_GPU_FORWARD(BasePrefetchingDataLayer, Forward); #endif -INSTANTIATE_CLASS(BaseDataLayer); -INSTANTIATE_CLASS(BasePrefetchingDataLayer); +INSTANTIATE_CLASS (BaseDataLayer); +INSTANTIATE_CLASS (BasePrefetchingDataLayer); } // namespace caffe diff --git a/src/caffe/layers/base_data_layer.cu b/src/caffe/layers/base_data_layer.cu deleted file mode 100644 index 9335a5bc..00000000 --- a/src/caffe/layers/base_data_layer.cu +++ /dev/null @@ -1,30 +0,0 @@ -#include - -#include "caffe/data_layers.hpp" - -namespace caffe { - -template -void BasePrefetchingDataLayer::Forward_gpu( - const vector*>& bottom, const vector*>& top) { - // First, join the thread - JoinPrefetchThread(); - // Reshape to loaded data. - top[0]->ReshapeLike(this->prefetch_data_); - // Copy the data - caffe_copy(prefetch_data_.count(), prefetch_data_.cpu_data(), - top[0]->mutable_gpu_data()); - if (this->output_labels_) { - // Reshape to loaded labels. - top[1]->ReshapeLike(prefetch_label_); - // Copy the labels. - caffe_copy(prefetch_label_.count(), prefetch_label_.cpu_data(), - top[1]->mutable_gpu_data()); - } - // Start a new prefetch thread - CreatePrefetchThread(); -} - -INSTANTIATE_LAYER_GPU_FORWARD(BasePrefetchingDataLayer); - -} // namespace caffe diff --git a/src/caffe/layers/bnll_layer.cpp b/src/caffe/layers/bnll_layer.cpp index 9ba0ea9a..68a19265 100644 --- a/src/caffe/layers/bnll_layer.cpp +++ b/src/caffe/layers/bnll_layer.cpp @@ -15,16 +15,16 @@ void BNLLLayer::Forward_cpu(const vector*>& bottom, Dtype* top_data = top[0]->mutable_cpu_data(); const int count = bottom[0]->count(); for (int i = 0; i < count; ++i) { - top_data[i] = bottom_data[i] > 0 ? - bottom_data[i] + log(1. + exp(-bottom_data[i])) : - log(1. + exp(bottom_data[i])); + top_data[i] = + bottom_data[i] > 0 ? + bottom_data[i] + log(1. + exp(-bottom_data[i])) : + log(1. 
+ exp(bottom_data[i])); } } template void BNLLLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { if (propagate_down[0]) { const Dtype* bottom_data = bottom[0]->cpu_data(); const Dtype* top_diff = top[0]->cpu_diff(); @@ -38,11 +38,37 @@ void BNLLLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY +#ifndef CPU_ONLY +// begin: code modified for OpenCL port +template +void BNLLLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + const int count = bottom[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + BNLLForward(count, bottom_data, top_data); +} + +template +void BNLLLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[0]) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const int count = bottom[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + BNLLBackward(count, top_diff, bottom_data, bottom_diff); + } +} +// end: code modified for OpenCL port + +#else STUB_GPU(BNLLLayer); #endif -INSTANTIATE_CLASS(BNLLLayer); -REGISTER_LAYER_CLASS(BNLL); +INSTANTIATE_CLASS (BNLLLayer); +REGISTER_LAYER_CLASS (BNLL); } // namespace caffe diff --git a/src/caffe/layers/bnll_layer.cu b/src/caffe/layers/bnll_layer.cu deleted file mode 100644 index d963d068..00000000 --- a/src/caffe/layers/bnll_layer.cu +++ /dev/null @@ -1,60 +0,0 @@ -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -const float kBNLL_THRESHOLD = 50.; - -template -__global__ void BNLLForward(const int n, const Dtype* in, Dtype* out) { - CUDA_KERNEL_LOOP(index, n) { - out[index] = in[index] > 0 ? - in[index] + log(1. + exp(-in[index])) : - log(1. 
+ exp(in[index])); - } -} - -template -void BNLLLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - BNLLForward<<>>( - count, bottom_data, top_data); - CUDA_POST_KERNEL_CHECK; -} - -template -__global__ void BNLLBackward(const int n, const Dtype* in_diff, - const Dtype* in_data, Dtype* out_diff) { - CUDA_KERNEL_LOOP(index, n) { - Dtype expval = exp(min(in_data[index], Dtype(kBNLL_THRESHOLD))); - out_diff[index] = in_diff[index] * expval / (expval + 1.); - } -} - -template -void BNLLLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - BNLLBackward<<>>( - count, top_diff, bottom_data, bottom_diff); - CUDA_POST_KERNEL_CHECK; - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(BNLLLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/concat_layer.cpp b/src/caffe/layers/concat_layer.cpp index 1cac8fc3..5def30d4 100644 --- a/src/caffe/layers/concat_layer.cpp +++ b/src/caffe/layers/concat_layer.cpp @@ -8,7 +8,7 @@ namespace caffe { template void ConcatLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const ConcatParameter& concat_param = this->layer_param_.concat_param(); CHECK(!(concat_param.has_axis() && concat_param.has_concat_dim())) << "Either axis or concat_dim should be specified; not both."; @@ -16,7 +16,7 @@ void ConcatLayer::LayerSetUp(const vector*>& bottom, template void ConcatLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const int num_axes = bottom[0]->num_axes(); const ConcatParameter& concat_param = this->layer_param_.concat_param(); if (concat_param.has_concat_dim()) { @@ -39,7 +39,9 @@ void ConcatLayer::Reshape(const vector*>& bottom, CHECK_EQ(num_axes, bottom[i]->num_axes()) << "All inputs must have the same #axes."; for (int j = 0; j < num_axes; ++j) { - if (j == concat_axis_) { continue; } + if (j == concat_axis_) { + continue; + } CHECK_EQ(top_shape[j], bottom[i]->shape(j)) << "All inputs must have the same shape, except at concat_axis."; } @@ -52,7 +54,7 @@ void ConcatLayer::Reshape(const vector*>& bottom, template void ConcatLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { Dtype* top_data = top[0]->mutable_cpu_data(); int offset_concat_axis = 0; const int top_concat_axis = top[0]->shape(concat_axis_); @@ -62,8 +64,9 @@ void ConcatLayer::Forward_cpu(const vector*>& bottom, for (int n = 0; n < num_concats_; ++n) { caffe_copy(bottom_concat_axis * concat_input_size_, bottom_data + n * bottom_concat_axis * concat_input_size_, - top_data + (n * top_concat_axis + offset_concat_axis) - * concat_input_size_); + top_data + + (n * top_concat_axis + offset_concat_axis) + * concat_input_size_); } offset_concat_axis += bottom_concat_axis; } @@ -71,28 +74,78 @@ void ConcatLayer::Forward_cpu(const vector*>& bottom, template void ConcatLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { const Dtype* top_diff = top[0]->cpu_diff(); int offset_concat_axis = 0; 
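// Illustrative sketch (editor's example, not part of the patch): the index
// arithmetic behind the Concat(...) helper invoked by the OpenCL Forward_gpu /
// Backward_gpu further below. It mirrors the CUDA kernel deleted from
// concat_layer.cu: each flat index into a bottom blob maps to its slot in the
// concatenated top blob, and running the map with forward == false scatters
// the top gradient back into that bottom.
template <typename Dtype>
void concat_reference(const int nthreads, const Dtype* in_data,
                      const bool forward, const int num_concats,
                      const int concat_size, const int top_concat_axis,
                      const int bottom_concat_axis,
                      const int offset_concat_axis, Dtype* out_data) {
  for (int index = 0; index < nthreads; ++index) {
    const int total_concat_size = concat_size * bottom_concat_axis;
    const int concat_num = index / total_concat_size;    // slice along num_concats_
    const int concat_index = index % total_concat_size;  // position inside the slice
    const int top_index = concat_index +
        (concat_num * top_concat_axis + offset_concat_axis) * concat_size;
    if (forward) {
      out_data[top_index] = in_data[index];   // gather bottom -> top
    } else {
      out_data[index] = in_data[top_index];   // scatter top diff -> bottom
    }
  }
}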
const int top_concat_axis = top[0]->shape(concat_axis_); for (int i = 0; i < bottom.size(); ++i) { - if (!propagate_down[i]) { continue; } + if (!propagate_down[i]) { + continue; + } Dtype* bottom_diff = bottom[i]->mutable_cpu_diff(); const int bottom_concat_axis = bottom[i]->shape(concat_axis_); for (int n = 0; n < num_concats_; ++n) { - caffe_copy(bottom_concat_axis * concat_input_size_, top_diff + - (n * top_concat_axis + offset_concat_axis) * concat_input_size_, + caffe_copy(bottom_concat_axis * concat_input_size_, + top_diff + + (n * top_concat_axis + offset_concat_axis) * concat_input_size_, bottom_diff + n * bottom_concat_axis * concat_input_size_); } offset_concat_axis += bottom_concat_axis; } } -#ifdef CPU_ONLY +#ifndef CPU_ONLY +// begin: code modified for OpenCL port +template +void ConcatLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + if (bottom.size() == 1) { + return; + } + Dtype* top_data = top[0]->mutable_gpu_data(); + int offset_concat_axis = 0; + const int top_concat_axis = top[0]->shape(concat_axis_); + const bool kForward = true; + for (int i = 0; i < bottom.size(); ++i) { + const Dtype* bottom_data = bottom[i]->gpu_data(); + const int bottom_concat_axis = bottom[i]->shape(concat_axis_); + const int bottom_concat_size = bottom_concat_axis * concat_input_size_; + const int nthreads = bottom_concat_size * num_concats_; + Concat(nthreads, bottom_data, kForward, num_concats_, concat_input_size_, + top_concat_axis, bottom_concat_axis, offset_concat_axis, top_data); + offset_concat_axis += bottom_concat_axis; + } +} + +template +void ConcatLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + if (bottom.size() == 1) { + return; + } + const Dtype* top_diff = top[0]->gpu_diff(); + int offset_concat_axis = 0; + const int top_concat_axis = top[0]->shape(concat_axis_); + const bool kForward = false; + for (int i = 0; i < bottom.size(); ++i) { + const int bottom_concat_axis = bottom[i]->shape(concat_axis_); + if (propagate_down[i]) { + Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); + const int bottom_concat_size = bottom_concat_axis * concat_input_size_; + const int nthreads = bottom_concat_size * num_concats_; + Concat(nthreads, top_diff, kForward, num_concats_, concat_input_size_, + top_concat_axis, bottom_concat_axis, offset_concat_axis, bottom_diff); + } + offset_concat_axis += bottom_concat_axis; + } +} +// end: code modified for OpenCL port + +#else STUB_GPU(ConcatLayer); #endif -INSTANTIATE_CLASS(ConcatLayer); -REGISTER_LAYER_CLASS(Concat); +INSTANTIATE_CLASS (ConcatLayer); +REGISTER_LAYER_CLASS (Concat); } // namespace caffe diff --git a/src/caffe/layers/concat_layer.cu b/src/caffe/layers/concat_layer.cu deleted file mode 100644 index 8f2e85d8..00000000 --- a/src/caffe/layers/concat_layer.cu +++ /dev/null @@ -1,71 +0,0 @@ -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -__global__ void Concat(const int nthreads, const Dtype* in_data, - const bool forward, const int num_concats, const int concat_size, - const int top_concat_axis, const int bottom_concat_axis, - const int offset_concat_axis, Dtype* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int total_concat_size = concat_size * bottom_concat_axis; - const int concat_num = index / total_concat_size; - const int concat_index = index % total_concat_size; - const int top_index = concat_index + - (concat_num * top_concat_axis + 
offset_concat_axis) * concat_size; - if (forward) { - out_data[top_index] = in_data[index]; - } else { - out_data[index] = in_data[top_index]; - } - } -} - -template -void ConcatLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - Dtype* top_data = top[0]->mutable_gpu_data(); - int offset_concat_axis = 0; - const int top_concat_axis = top[0]->shape(concat_axis_); - const bool kForward = true; - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - const int bottom_concat_axis = bottom[i]->shape(concat_axis_); - const int bottom_concat_size = bottom_concat_axis * concat_input_size_; - const int nthreads = bottom_concat_size * num_concats_; - Concat // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - nthreads, bottom_data, kForward, num_concats_, concat_input_size_, - top_concat_axis, bottom_concat_axis, offset_concat_axis, top_data); - offset_concat_axis += bottom_concat_axis; - } -} - -template -void ConcatLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* top_diff = top[0]->gpu_diff(); - int offset_concat_axis = 0; - const int top_concat_axis = top[0]->shape(concat_axis_); - const bool kForward = false; - for (int i = 0; i < bottom.size(); ++i) { - if (!propagate_down[i]) { continue; } - Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); - const int bottom_concat_axis = bottom[i]->shape(concat_axis_); - const int bottom_concat_size = bottom_concat_axis * concat_input_size_; - const int nthreads = bottom_concat_size * num_concats_; - Concat // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - nthreads, top_diff, kForward, num_concats_, concat_input_size_, - top_concat_axis, bottom_concat_axis, offset_concat_axis, bottom_diff); - offset_concat_axis += bottom_concat_axis; - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(ConcatLayer); - -} // namespace caffe diff --git a/src/caffe/layers/contrastive_loss_layer.cpp b/src/caffe/layers/contrastive_loss_layer.cpp index 25e16781..3410b927 100644 --- a/src/caffe/layers/contrastive_loss_layer.cpp +++ b/src/caffe/layers/contrastive_loss_layer.cpp @@ -9,9 +9,9 @@ namespace caffe { template -void ContrastiveLossLayer::LayerSetUp( - const vector*>& bottom, const vector*>& top) { - LossLayer::LayerSetUp(bottom, top); +void ContrastiveLossLayer::LayerSetUp(const vector*>& bottom, + const vector*>& top) { + LossLayer < Dtype > ::LayerSetUp(bottom, top); CHECK_EQ(bottom[0]->channels(), bottom[1]->channels()); CHECK_EQ(bottom[0]->height(), 1); CHECK_EQ(bottom[0]->width(), 1); @@ -31,12 +31,9 @@ void ContrastiveLossLayer::LayerSetUp( template void ContrastiveLossLayer::Forward_cpu( - const vector*>& bottom, - const vector*>& top) { + const vector*>& bottom, const vector*>& top) { int count = bottom[0]->count(); - caffe_sub( - count, - bottom[0]->cpu_data(), // a + caffe_sub(count, bottom[0]->cpu_data(), // a bottom[1]->cpu_data(), // b diff_.mutable_cpu_data()); // a_i-b_i const int channels = bottom[0]->channels(); @@ -46,7 +43,7 @@ void ContrastiveLossLayer::Forward_cpu( Dtype loss(0.0); for (int i = 0; i < bottom[0]->num(); ++i) { dist_sq_.mutable_cpu_data()[i] = caffe_cpu_dot(channels, - diff_.cpu_data() + (i*channels), diff_.cpu_data() + (i*channels)); + diff_.cpu_data() + (i * channels), diff_.cpu_data() + (i * channels)); if (static_cast(bottom[2]->cpu_data()[i])) { // similar pairs loss += dist_sq_.cpu_data()[i]; } else { // dissimilar pairs @@ -54,7 +51,7 @@ void ContrastiveLossLayer::Forward_cpu( loss += std::max(margin - 
dist_sq_.cpu_data()[i], Dtype(0.0)); } else { Dtype dist = std::max(margin - sqrt(dist_sq_.cpu_data()[i]), 0.0); - loss += dist*dist; + loss += dist * dist; } } } @@ -71,19 +68,15 @@ void ContrastiveLossLayer::Backward_cpu(const vector*>& top, for (int i = 0; i < 2; ++i) { if (propagate_down[i]) { const Dtype sign = (i == 0) ? 1 : -1; - const Dtype alpha = sign * top[0]->cpu_diff()[0] / - static_cast(bottom[i]->num()); + const Dtype alpha = sign * top[0]->cpu_diff()[0] + / static_cast(bottom[i]->num()); int num = bottom[i]->num(); int channels = bottom[i]->channels(); for (int j = 0; j < num; ++j) { Dtype* bout = bottom[i]->mutable_cpu_diff(); if (static_cast(bottom[2]->cpu_data()[j])) { // similar pairs - caffe_cpu_axpby( - channels, - alpha, - diff_.cpu_data() + (j*channels), - Dtype(0.0), - bout + (j*channels)); + caffe_cpu_axpby(channels, alpha, diff_.cpu_data() + (j * channels), + Dtype(0.0), bout + (j * channels)); } else { // dissimilar pairs Dtype mdist(0.0); Dtype beta(0.0); @@ -96,14 +89,10 @@ void ContrastiveLossLayer::Backward_cpu(const vector*>& top, beta = -alpha * mdist / (dist + Dtype(1e-4)); } if (mdist > Dtype(0.0)) { - caffe_cpu_axpby( - channels, - beta, - diff_.cpu_data() + (j*channels), - Dtype(0.0), - bout + (j*channels)); + caffe_cpu_axpby(channels, beta, diff_.cpu_data() + (j * channels), + Dtype(0.0), bout + (j * channels)); } else { - caffe_set(channels, Dtype(0), bout + (j*channels)); + caffe_set(channels, Dtype(0), bout + (j * channels)); } } } @@ -111,11 +100,69 @@ void ContrastiveLossLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY +// begin: code modified for OpenCL port +#ifndef CPU_ONLY +template +void ContrastiveLossLayer::Forward_gpu( + const vector*>& bottom, const vector*>& top) { + const int count = bottom[0]->count(); + caffe_gpu_sub(count, bottom[0]->gpu_data(), // a + bottom[1]->gpu_data(), // b + diff_.mutable_gpu_data()); // a_i-b_i + caffe_gpu_powx(count, diff_.mutable_gpu_data(), // a_i-b_i + Dtype(2), diff_sq_.mutable_gpu_data()); // (a_i-b_i)^2 + caffe_gpu_gemv(CblasNoTrans, bottom[0]->num(), bottom[0]->channels(), + Dtype(1.0), diff_sq_.gpu_data(), // (a_i-b_i)^2 + summer_vec_.gpu_data(), Dtype(0.0), dist_sq_.mutable_gpu_data()); // \Sum (a_i-b_i)^2 + Dtype margin = this->layer_param_.contrastive_loss_param().margin(); + bool legacy_version = + this->layer_param_.contrastive_loss_param().legacy_version(); + Dtype loss(0.0); + for (int i = 0; i < bottom[0]->num(); ++i) { + if (static_cast(bottom[2]->cpu_data()[i])) { // similar pairs + loss += dist_sq_.cpu_data()[i]; + } else { // dissimilar pairs + if (legacy_version) { + loss += std::max(margin - dist_sq_.cpu_data()[i], Dtype(0.0)); + } else { + Dtype dist = std::max(margin - sqrt(dist_sq_.cpu_data()[i]), 0.0); + loss += dist * dist; + } + } + } + loss = loss / static_cast(bottom[0]->num()) / Dtype(2); + top[0]->mutable_cpu_data()[0] = loss; +} + +template +void ContrastiveLossLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + for (int i = 0; i < 2; ++i) { + if (propagate_down[i]) { + const int count = bottom[0]->count(); + const int channels = bottom[0]->channels(); + Dtype margin = this->layer_param_.contrastive_loss_param().margin(); + const bool legacy_version = + this->layer_param_.contrastive_loss_param().legacy_version(); + const Dtype sign = (i == 0) ? 
1 : -1; + const Dtype alpha = sign * top[0]->cpu_diff()[0] + / static_cast(bottom[0]->num()); + // NOLINT_NEXT_LINE(whitespace/operators) + CLLBackward(count, channels, margin, legacy_version, alpha, + bottom[2]->gpu_data(), // pair similarity 0 or 1 + diff_.gpu_data(), // the cached eltwise difference between a and b + dist_sq_.gpu_data(), // the cached square distance between a and b + bottom[i]->mutable_gpu_diff()); + } + } +} +// end: code modified for OpenCL port + +#else STUB_GPU(ContrastiveLossLayer); #endif -INSTANTIATE_CLASS(ContrastiveLossLayer); -REGISTER_LAYER_CLASS(ContrastiveLoss); +INSTANTIATE_CLASS (ContrastiveLossLayer); +REGISTER_LAYER_CLASS (ContrastiveLoss); } // namespace caffe diff --git a/src/caffe/layers/contrastive_loss_layer.cu b/src/caffe/layers/contrastive_loss_layer.cu deleted file mode 100644 index 93123931..00000000 --- a/src/caffe/layers/contrastive_loss_layer.cu +++ /dev/null @@ -1,111 +0,0 @@ -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/util/io.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void ContrastiveLossLayer::Forward_gpu( - const vector*>& bottom, const vector*>& top) { - const int count = bottom[0]->count(); - caffe_gpu_sub( - count, - bottom[0]->gpu_data(), // a - bottom[1]->gpu_data(), // b - diff_.mutable_gpu_data()); // a_i-b_i - caffe_gpu_powx( - count, - diff_.mutable_gpu_data(), // a_i-b_i - Dtype(2), - diff_sq_.mutable_gpu_data()); // (a_i-b_i)^2 - caffe_gpu_gemv( - CblasNoTrans, - bottom[0]->num(), - bottom[0]->channels(), - Dtype(1.0), - diff_sq_.gpu_data(), // (a_i-b_i)^2 - summer_vec_.gpu_data(), - Dtype(0.0), - dist_sq_.mutable_gpu_data()); // \Sum (a_i-b_i)^2 - Dtype margin = this->layer_param_.contrastive_loss_param().margin(); - bool legacy_version = - this->layer_param_.contrastive_loss_param().legacy_version(); - Dtype loss(0.0); - for (int i = 0; i < bottom[0]->num(); ++i) { - if (static_cast(bottom[2]->cpu_data()[i])) { // similar pairs - loss += dist_sq_.cpu_data()[i]; - } else { // dissimilar pairs - if (legacy_version) { - loss += std::max(margin - dist_sq_.cpu_data()[i], Dtype(0.0)); - } else { - Dtype dist = std::max(margin - sqrt(dist_sq_.cpu_data()[i]), - Dtype(0.0)); - loss += dist*dist; - } - } - } - loss = loss / static_cast(bottom[0]->num()) / Dtype(2); - top[0]->mutable_cpu_data()[0] = loss; -} - -template -__global__ void CLLBackward(const int count, const int channels, - const Dtype margin, const bool legacy_version, const Dtype alpha, - const Dtype* y, const Dtype* diff, const Dtype* dist_sq, - Dtype *bottom_diff) { - CUDA_KERNEL_LOOP(i, count) { - int n = i / channels; // the num index, to access y and dist_sq - if (static_cast(y[n])) { // similar pairs - bottom_diff[i] = alpha * diff[i]; - } else { // dissimilar pairs - Dtype mdist(0.0); - Dtype beta(0.0); - if (legacy_version) { - mdist = (margin - dist_sq[n]); - beta = -alpha; - } else { - Dtype dist = sqrt(dist_sq[n]); - mdist = (margin - dist); - beta = -alpha * mdist / (dist + Dtype(1e-4)) * diff[i]; - } - if (mdist > 0.0) { - bottom_diff[i] = beta; - } else { - bottom_diff[i] = 0; - } - } - } -} - -template -void ContrastiveLossLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - for (int i = 0; i < 2; ++i) { - if (propagate_down[i]) { - const int count = bottom[0]->count(); - const int channels = bottom[0]->channels(); - Dtype margin = this->layer_param_.contrastive_loss_param().margin(); - const bool legacy_version 
= - this->layer_param_.contrastive_loss_param().legacy_version(); - const Dtype sign = (i == 0) ? 1 : -1; - const Dtype alpha = sign * top[0]->cpu_diff()[0] / - static_cast(bottom[0]->num()); - // NOLINT_NEXT_LINE(whitespace/operators) - CLLBackward<<>>( - count, channels, margin, legacy_version, alpha, - bottom[2]->gpu_data(), // pair similarity 0 or 1 - diff_.gpu_data(), // the cached eltwise difference between a and b - dist_sq_.gpu_data(), // the cached square distance between a and b - bottom[i]->mutable_gpu_diff()); - CUDA_POST_KERNEL_CHECK; - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(ContrastiveLossLayer); - -} // namespace caffe diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index 928ef5ee..4bfd4dba 100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -1,5 +1,4 @@ #include - #include "caffe/filler.hpp" #include "caffe/layer.hpp" #include "caffe/util/im2col.hpp" @@ -18,7 +17,7 @@ void ConvolutionLayer::compute_output_shape() { template void ConvolutionLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const Dtype* weight = this->blobs_[0]->cpu_data(); for (int i = 0; i < bottom.size(); ++i) { const Dtype* bottom_data = bottom[i]->cpu_data(); @@ -32,11 +31,13 @@ void ConvolutionLayer::Forward_cpu(const vector*>& bottom, } } } + + // CHECK_BLOB_DATA(top[0],20, "top[0]"); } template void ConvolutionLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { const Dtype* weight = this->blobs_[0]->cpu_data(); Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff(); for (int i = 0; i < top.size(); ++i) { @@ -65,12 +66,171 @@ void ConvolutionLayer::Backward_cpu(const vector*>& top, } } } + +} + +// begin: code modified for OpenCL port +#ifndef CPU_ONLY +template +void ConvolutionLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + //if (!this->is_1x1_ && use_packing_scheme && global_packing_N > 1) + //Forward_gpu_batched(bottom, top); + //else + Forward_gpu_org(bottom, top); +} + +template +void ConvolutionLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + //if (!this->is_1x1_ && use_packing_scheme && global_packing_N > 1) + Backward_gpu_batched(top, propagate_down, bottom); + //else + //Backward_gpu_org(top, propagate_down, bottom); +} + +template +void ConvolutionLayer::Forward_gpu_batched( + const vector*>& bottom, const vector*>& top) { + const Dtype* weight = this->blobs_[0]->gpu_data(); + for (int i = 0; i < bottom.size(); ++i) { + const Dtype* bottom_data = bottom[i]->gpu_data(); + //CHECK_BLOB_DATA(bottom[i],10,"bottom"); + + Dtype* top_data = top[i]->mutable_gpu_data(); + this->opt_num2 = global_packing_N; + this->weight_offset_ = this->M_ * this->K_; + for (int n = 0; n < this->num_; n += this->opt_num2) { + this->opt_num2 = + this->opt_num2 > (this->num_ - n) ? 
(this->num_ - n) : this->opt_num2; + //intermediate variables to pass offset + this->top_offset_opt = this->M_ * this->N_ * this->opt_num2; + this->top_offset_ = top[i]->offset(n); + this->col_offset_ = this->K_ * this->N_ * this->opt_num2; + this->bottom_offset_ = bottom[i]->offset(n); + this->forward_gpu_gemm_opt(bottom_data, weight, top_data); + if (this->bias_term_) { + const Dtype* bias = this->blobs_[1]->gpu_data(); + this->forward_gpu_bias_opt(top_data, bias); + } + } + } + + //CHECK_BLOB_DATA(this->blobs_[0],20, "weights"); + //CHECK_BLOB_DATA(top[0],20, "top[0]"); + +} + +template +void ConvolutionLayer::Forward_gpu_org( + const vector*>& bottom, const vector*>& top) { + const Dtype* weight = this->blobs_[0]->gpu_data(); + for (int i = 0; i < bottom.size(); ++i) { + const Dtype* bottom_data = bottom[i]->gpu_data(); + Dtype* top_data = top[i]->mutable_gpu_data(); + for (int n = 0; n < this->num_; ++n) { + //two intermediate variables to pass offset + this->bottom_offset_ = bottom[i]->offset(n); + this->top_offset_ = top[i]->offset(n); + this->col_offset_ = this->K_ * this->N_; + this->forward_gpu_gemm(bottom_data, weight, top_data); + + if (this->bias_term_) { + const Dtype* bias = this->blobs_[1]->gpu_data(); + this->forward_gpu_bias(top_data, bias); + } + } + } + + // CHECK_BLOB_DATA(this->blobs_[0],20, "weights"); + //CHECK_BLOB_DATA(top[0],20, "top[0]"); +} + +template +void ConvolutionLayer::Backward_gpu_batched(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + const Dtype* weight = this->blobs_[0]->gpu_data(); + Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); + for (int i = 0; i < top.size(); ++i) { + const Dtype* top_diff = top[i]->gpu_diff(); + + // Bias gradient, if necessary. + if (this->bias_term_ && this->param_propagate_down_[1]) { + Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); + for (int n = 0; n < this->num_; ++n) { + this->top_offset_ = top[i]->offset(n); + this->backward_gpu_bias(bias_diff, top_diff); + } + } + if (this->param_propagate_down_[0] || propagate_down[i]) { + const Dtype* bottom_data = bottom[i]->gpu_data(); + Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); + this->weight_offset_ = this->M_ * this->K_; + this->opt_num2 = global_packing_N; + for (int n = 0; n < this->num_; n += this->opt_num2) { + this->opt_num2 = + this->opt_num2 > (this->num_ - n) ? + (this->num_ - n) : this->opt_num2; + this->top_offset_ = top[i]->offset(n); + this->bottom_offset_ = bottom[i]->offset(n); + this->col_offset_ = this->K_ * (this->N_ * this->opt_num2); + this->top_offset_opt = this->M_ * (this->N_ * this->opt_num2); + // gradient w.r.t. weight. Note that we will accumulate diffs. + if (this->param_propagate_down_[0]) { + this->weight_gpu_gemm_opt(bottom_data, top_diff, weight_diff); + } + // gradient w.r.t. bottom data, if necessary. + if (propagate_down[i]) { + this->backward_gpu_gemm_opt(top_diff, weight, bottom_diff); + } + } + } + } +} +template +void ConvolutionLayer::Backward_gpu_org(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + const Dtype* weight = this->blobs_[0]->gpu_data(); + Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); + for (int i = 0; i < top.size(); ++i) { + const Dtype* top_diff = top[i]->gpu_diff(); + + // Bias gradient, if necessary. 
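// Illustrative sketch (editor's example, not part of the patch): the chunking
// used by the *_batched paths above. global_packing_N images are packed into a
// single GEMM, and opt_num2 is clamped on the final chunk so a batch size that
// is not a multiple of the packing factor is still covered exactly once. The
// per-chunk slab offsets mirror the member variables set above:
//   col_offset_     = K_ * N_ * opt_num2   (packed im2col slab per group)
//   top_offset_opt  = M_ * N_ * opt_num2   (packed output slab per group)
#include <algorithm>
#include <cstdio>
int main() {
  const int num = 50;                // assumed mini-batch size
  const int global_packing_N = 16;   // assumed packing factor
  for (int n = 0; n < num; n += global_packing_N) {
    const int opt_num2 = std::min(global_packing_N, num - n);
    std::printf("images [%d, %d) -> one packed GEMM over %d images\n",
                n, n + opt_num2, opt_num2);   // chunks of 16, 16, 16, 2
  }
  return 0;
}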
+ if (this->bias_term_ && this->param_propagate_down_[1]) { + Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); + for (int n = 0; n < this->num_; ++n) { + // + this->top_offset_ = top[i]->offset(n); + this->bottom_offset_ = bottom[i]->offset(n); + this->col_offset_ = this->K_ * this->N_; + this->backward_gpu_bias(bias_diff, top_diff); + } + } + if (this->param_propagate_down_[0] || propagate_down[i]) { + const Dtype* bottom_data = bottom[i]->gpu_data(); + Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); + for (int n = 0; n < this->num_; ++n) { + this->top_offset_ = top[i]->offset(n); + this->bottom_offset_ = bottom[i]->offset(n); + // gradient w.r.t. weight. Note that we will accumulate diffs. + if (this->param_propagate_down_[0]) { + this->weight_gpu_gemm(bottom_data, top_diff, weight_diff); + } + // gradient w.r.t. bottom data, if necessary. + if (propagate_down[i]) { + this->backward_gpu_gemm(top_diff, weight, bottom_diff); + } + } + } + } + } +// end: code modified for OpenCL port -#ifdef CPU_ONLY +#else STUB_GPU(ConvolutionLayer); #endif -INSTANTIATE_CLASS(ConvolutionLayer); +INSTANTIATE_CLASS (ConvolutionLayer); } // namespace caffe diff --git a/src/caffe/layers/conv_layer.cu b/src/caffe/layers/conv_layer.cu deleted file mode 100644 index b8a98ff7..00000000 --- a/src/caffe/layers/conv_layer.cu +++ /dev/null @@ -1,64 +0,0 @@ -#include - -#include "caffe/filler.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/im2col.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void ConvolutionLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* weight = this->blobs_[0]->gpu_data(); - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* top_data = top[i]->mutable_gpu_data(); - for (int n = 0; n < this->num_; ++n) { - this->forward_gpu_gemm(bottom_data + bottom[i]->offset(n), weight, - top_data + top[i]->offset(n)); - if (this->bias_term_) { - const Dtype* bias = this->blobs_[1]->gpu_data(); - this->forward_gpu_bias(top_data + top[i]->offset(n), bias); - } - } - } -} - -template -void ConvolutionLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* weight = this->blobs_[0]->gpu_data(); - Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); - for (int i = 0; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->gpu_diff(); - // Bias gradient, if necessary. - if (this->bias_term_ && this->param_propagate_down_[1]) { - Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); - for (int n = 0; n < this->num_; ++n) { - this->backward_gpu_bias(bias_diff, top_diff + top[i]->offset(n)); - } - } - if (this->param_propagate_down_[0] || propagate_down[i]) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); - for (int n = 0; n < this->num_; ++n) { - // gradient w.r.t. weight. Note that we will accumulate diffs. - if (this->param_propagate_down_[0]) { - this->weight_gpu_gemm(bottom_data + bottom[i]->offset(n), - top_diff + top[i]->offset(n), weight_diff); - } - // gradient w.r.t. bottom data, if necessary. 
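// Illustrative note (editor's example, not part of the patch): why the OpenCL
// paths pass offsets instead of adjusted pointers. The CUDA code removed here
// advances raw device pointers per image, e.g.
//
//   this->forward_gpu_gemm(bottom_data + bottom[i]->offset(n), weight,
//                          top_data + top[i]->offset(n));
//
// An OpenCL buffer is an opaque cl_mem handle, so no pointer arithmetic is
// possible on the host. The replacement *_gpu_org / *_gpu_batched paths keep
// the base handles and stash the per-image element offsets in members
// (bottom_offset_, top_offset_, col_offset_) that the clBLAS-backed
// caffe_gpu_gemm / caffe_gpu_gemv wrappers take as explicit offset arguments
// alongside each buffer.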
- if (propagate_down[i]) { - this->backward_gpu_gemm(top_diff + top[i]->offset(n), weight, - bottom_diff + bottom[i]->offset(n)); - } - } - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(ConvolutionLayer); - -} // namespace caffe diff --git a/src/caffe/layers/cudnn_conv_layer.cpp b/src/caffe/layers/cudnn_conv_layer.cpp deleted file mode 100644 index 104d2b9d..00000000 --- a/src/caffe/layers/cudnn_conv_layer.cpp +++ /dev/null @@ -1,130 +0,0 @@ -#ifdef USE_CUDNN -#include - -#include "caffe/filler.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/im2col.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -// Set to three for the benefit of the backward pass, which -// can use separate streams for calculating the gradient w.r.t. -// bias, filter weights, and bottom data for each group independently -#define CUDNN_STREAMS_PER_GROUP 3 - -/** - * TODO(dox) explain cuDNN interface - */ -template -void CuDNNConvolutionLayer::LayerSetUp( - const vector*>& bottom, const vector*>& top) { - ConvolutionLayer::LayerSetUp(bottom, top); - // Initialize CUDA streams and cuDNN. - stream_ = new cudaStream_t[this->group_ * CUDNN_STREAMS_PER_GROUP]; - handle_ = new cudnnHandle_t[this->group_ * CUDNN_STREAMS_PER_GROUP]; - workspaceSizeInBytes = 0; - workspace = NULL; - - for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) { - CUDA_CHECK(cudaStreamCreate(&stream_[g])); - CUDNN_CHECK(cudnnCreate(&handle_[g])); - CUDNN_CHECK(cudnnSetStream(handle_[g], stream_[g])); - } - - // Set the indexing parameters. - weight_offset_ = (this->num_output_ / this->group_) - * (this->channels_ / this->group_) * this->kernel_h_ * this->kernel_w_; - bias_offset_ = (this->num_output_ / this->group_); - - // Create filter descriptor. - cudnn::createFilterDesc(&filter_desc_, - this->num_output_ / this->group_, this->channels_ / this->group_, - this->kernel_h_, this->kernel_w_); - - // Create tensor descriptor(s) for data and corresponding convolution(s). - for (int i = 0; i < bottom.size(); i++) { - cudnnTensorDescriptor_t bottom_desc; - cudnn::createTensor4dDesc(&bottom_desc); - bottom_descs_.push_back(bottom_desc); - cudnnTensorDescriptor_t top_desc; - cudnn::createTensor4dDesc(&top_desc); - top_descs_.push_back(top_desc); - cudnnConvolutionDescriptor_t conv_desc; - cudnn::createConvolutionDesc(&conv_desc); - conv_descs_.push_back(conv_desc); - } - - // Tensor descriptor for bias. 
- if (this->bias_term_) { - cudnn::createTensor4dDesc(&bias_desc_); - } - - handles_setup_ = true; -} - -template -void CuDNNConvolutionLayer::Reshape( - const vector*>& bottom, const vector*>& top) { - ConvolutionLayer::Reshape(bottom, top); - bottom_offset_ = (this->channels_ / this->group_) - * this->height_ * this->width_; - top_offset_ = (this->num_output_ / this->group_) - * this->height_out_ * this->width_out_; - - for (int i = 0; i < bottom.size(); i++) { - cudnn::setTensor4dDesc(&bottom_descs_[i], - this->num_, - this->channels_ / this->group_, - this->height_, this->width_, - this->channels_ * this->height_ * this->width_, - this->height_ * this->width_, - this->width_, 1); - cudnn::setTensor4dDesc(&top_descs_[i], - this->num_, - this->num_output_ / this->group_, - this->height_out_, this->width_out_, - this->num_output_ * this->height_out_ * this->width_out_, - this->height_out_ * this->width_out_, - this->width_out_, 1); - cudnn::setConvolutionDesc(&conv_descs_[i], bottom_descs_[i], - filter_desc_, this->pad_h_, this->pad_w_, - this->stride_h_, this->stride_w_); - } - - // Tensor descriptor for bias. - if (this->bias_term_) { - cudnn::setTensor4dDesc(&bias_desc_, - 1, this->num_output_ / this->group_, 1, 1); - } -} - -template -CuDNNConvolutionLayer::~CuDNNConvolutionLayer() { - // Check that handles have been setup before destroying. - if (!handles_setup_) { return; } - - for (int i = 0; i < bottom_descs_.size(); i++) { - cudnnDestroyTensorDescriptor(bottom_descs_[i]); - cudnnDestroyTensorDescriptor(top_descs_[i]); - cudnnDestroyConvolutionDescriptor(conv_descs_[i]); - } - if (this->bias_term_) { - cudnnDestroyTensorDescriptor(bias_desc_); - } - cudnnDestroyFilterDescriptor(filter_desc_); - - for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) { - cudaStreamDestroy(stream_[g]); - cudnnDestroy(handle_[g]); - } - - delete [] stream_; - delete [] handle_; -} - -INSTANTIATE_CLASS(CuDNNConvolutionLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cudnn_conv_layer.cu b/src/caffe/layers/cudnn_conv_layer.cu deleted file mode 100644 index b4e802e1..00000000 --- a/src/caffe/layers/cudnn_conv_layer.cu +++ /dev/null @@ -1,160 +0,0 @@ -#ifdef USE_CUDNN -#include - -#include "caffe/filler.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/im2col.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -__global__ void sync_conv_groups() { } - -template -void CuDNNConvolutionLayer::Forward_gpu( - const vector*>& bottom, const vector*>& top) { - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* top_data = top[i]->mutable_gpu_data(); - const Dtype* weight = this->blobs_[0]->gpu_data(); - - size_t workspace_limit_bytes = this->kernel_h_ * - this->kernel_w_ * - this->channels_ * - sizeof(int) + 1; - - // Forward through cuDNN in parallel over groups. 
- for (int g = 0; g < this->group_; g++) { - cudnnConvolutionFwdAlgo_t algo; - - // pick the convolution algorithm - // TODO(shelhamer) this should be done during reshape - // TODO(shelhamer) the choice of automatic or manual algorithm picking - // should be exposed in proto - CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(handle_[g], - bottom_descs_[i], - filter_desc_, - conv_descs_[i], - top_descs_[i], - CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, - workspace_limit_bytes, // memoryLimitInBytes, - &algo)); - - // get minimum size of the workspace needed for the desired algorithm - size_t workspaceSizeInBytes_temp = 0; - - CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(handle_[g], - bottom_descs_[i], - filter_desc_, - conv_descs_[i], - top_descs_[i], - algo, - &workspaceSizeInBytes_temp)); - - if (workspaceSizeInBytes_temp > workspaceSizeInBytes) { - workspaceSizeInBytes = workspaceSizeInBytes_temp; - // free the existing workspace and allocate a new (larger) one - cudaFree(this->workspace); - cudaError_t err = cudaMalloc(&(this->workspace), workspaceSizeInBytes); - if (err != cudaSuccess) { - // force zero memory path - algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; - workspace = NULL; - workspaceSizeInBytes = 0; - } - } - - // Filters. - CUDNN_CHECK(cudnnConvolutionForward(handle_[g], - cudnn::dataType::one, - bottom_descs_[i], bottom_data + bottom_offset_ * g, - filter_desc_, weight + weight_offset_ * g, - conv_descs_[i], - algo, workspace, workspaceSizeInBytes, - cudnn::dataType::zero, - top_descs_[i], top_data + top_offset_ * g)); - - // Bias. - if (this->bias_term_) { - const Dtype* bias_data = this->blobs_[1]->gpu_data(); - CUDNN_CHECK(cudnnAddTensor(handle_[g], CUDNN_ADD_SAME_C, - cudnn::dataType::one, - bias_desc_, bias_data + bias_offset_ * g, - cudnn::dataType::one, - top_descs_[i], top_data + top_offset_ * g)); - } - } - - // Synchronize the work across groups, each of which went into its own - // stream, by launching an empty kernel into the default (null) stream. - // NOLINT_NEXT_LINE(whitespace/operators) - sync_conv_groups<<<1, 1>>>(); - } -} - -template -void CuDNNConvolutionLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* weight = NULL; - Dtype* weight_diff = NULL; - if (this->param_propagate_down_[0]) { - weight = this->blobs_[0]->gpu_data(); - weight_diff = this->blobs_[0]->mutable_gpu_diff(); - } - Dtype* bias_diff = NULL; - if (this->bias_term_ && this->param_propagate_down_[1]) { - bias_diff = this->blobs_[1]->mutable_gpu_diff(); - } - for (int i = 0; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->gpu_diff(); - // Backward through cuDNN in parallel over groups and gradients. - for (int g = 0; g < this->group_; g++) { - // Gradient w.r.t. bias. - if (this->bias_term_ && this->param_propagate_down_[1]) { - CUDNN_CHECK(cudnnConvolutionBackwardBias(handle_[0*this->group_ + g], - cudnn::dataType::one, - top_descs_[i], top_diff + top_offset_ * g, - cudnn::dataType::one, - bias_desc_, bias_diff + bias_offset_ * g)); - } - - // Gradient w.r.t. weights. - if (this->param_propagate_down_[0]) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - CUDNN_CHECK(cudnnConvolutionBackwardFilter(handle_[1*this->group_ + g], - cudnn::dataType::one, - bottom_descs_[i], bottom_data + bottom_offset_ * g, - top_descs_[i], top_diff + top_offset_ * g, - conv_descs_[i], - cudnn::dataType::one, - filter_desc_, weight_diff + weight_offset_ * g)); - } - - // Gradient w.r.t. bottom data. 
- if (propagate_down[i]) { - if (weight == NULL) { - weight = this->blobs_[0]->gpu_data(); - } - Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); - CUDNN_CHECK(cudnnConvolutionBackwardData(handle_[2*this->group_ + g], - cudnn::dataType::one, - filter_desc_, weight + weight_offset_ * g, - top_descs_[i], top_diff + top_offset_ * g, - conv_descs_[i], - cudnn::dataType::zero, - bottom_descs_[i], bottom_diff + bottom_offset_ * g)); - } - } - - // Synchronize the work across groups, each of which went into its own - // stream, by launching an empty kernel into the default (null) stream. - // NOLINT_NEXT_LINE(whitespace/operators) - sync_conv_groups<<<1, 1>>>(); - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(CuDNNConvolutionLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cudnn_pooling_layer.cpp b/src/caffe/layers/cudnn_pooling_layer.cpp deleted file mode 100644 index c92c4e47..00000000 --- a/src/caffe/layers/cudnn_pooling_layer.cpp +++ /dev/null @@ -1,50 +0,0 @@ -#ifdef USE_CUDNN -#include - -#include "caffe/filler.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/im2col.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void CuDNNPoolingLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - PoolingLayer::LayerSetUp(bottom, top); - CUDNN_CHECK(cudnnCreate(&handle_)); - cudnn::createTensor4dDesc(&bottom_desc_); - cudnn::createTensor4dDesc(&top_desc_); - cudnn::createPoolingDesc(&pooling_desc_, - this->layer_param_.pooling_param().pool(), &mode_, - this->kernel_h_, this->kernel_w_, this->pad_h_, this->pad_w_, - this->stride_h_, this->stride_w_); - handles_setup_ = true; -} - -template -void CuDNNPoolingLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - PoolingLayer::Reshape(bottom, top); - cudnn::setTensor4dDesc(&bottom_desc_, bottom[0]->num(), - this->channels_, this->height_, this->width_); - cudnn::setTensor4dDesc(&top_desc_, bottom[0]->num(), - this->channels_, this->pooled_height_, this->pooled_width_); -} - -template -CuDNNPoolingLayer::~CuDNNPoolingLayer() { - // Check that handles have been setup before destroying. 
- if (!handles_setup_) { return; } - - cudnnDestroyTensorDescriptor(bottom_desc_); - cudnnDestroyTensorDescriptor(top_desc_); - cudnnDestroyPoolingDescriptor(pooling_desc_); - cudnnDestroy(handle_); -} - -INSTANTIATE_CLASS(CuDNNPoolingLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cudnn_pooling_layer.cu b/src/caffe/layers/cudnn_pooling_layer.cu deleted file mode 100644 index a952b855..00000000 --- a/src/caffe/layers/cudnn_pooling_layer.cu +++ /dev/null @@ -1,45 +0,0 @@ -#ifdef USE_CUDNN -#include - -#include "caffe/filler.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/im2col.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void CuDNNPoolingLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - CUDNN_CHECK(cudnnPoolingForward(handle_, pooling_desc_, - cudnn::dataType::one, - bottom_desc_, bottom_data, - cudnn::dataType::zero, - top_desc_, top_data)); -} - -template -void CuDNNPoolingLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { - return; - } - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - CUDNN_CHECK(cudnnPoolingBackward(handle_, pooling_desc_, - cudnn::dataType::one, - top_desc_, top_data, top_desc_, top_diff, - bottom_desc_, bottom_data, - cudnn::dataType::zero, - bottom_desc_, bottom_diff)); -} - -INSTANTIATE_LAYER_GPU_FUNCS(CuDNNPoolingLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cudnn_relu_layer.cpp b/src/caffe/layers/cudnn_relu_layer.cpp deleted file mode 100644 index 759d8398..00000000 --- a/src/caffe/layers/cudnn_relu_layer.cpp +++ /dev/null @@ -1,46 +0,0 @@ -#ifdef USE_CUDNN -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void CuDNNReLULayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - ReLULayer::LayerSetUp(bottom, top); - // initialize cuDNN - CUDNN_CHECK(cudnnCreate(&handle_)); - cudnn::createTensor4dDesc(&bottom_desc_); - cudnn::createTensor4dDesc(&top_desc_); - handles_setup_ = true; -} - -template -void CuDNNReLULayer::Reshape(const vector*>& bottom, - const vector*>& top) { - ReLULayer::Reshape(bottom, top); - const int N = bottom[0]->num(); - const int K = bottom[0]->channels(); - const int H = bottom[0]->height(); - const int W = bottom[0]->width(); - cudnn::setTensor4dDesc(&bottom_desc_, N, K, H, W); - cudnn::setTensor4dDesc(&top_desc_, N, K, H, W); -} - -template -CuDNNReLULayer::~CuDNNReLULayer() { - // Check that handles have been setup before destroying. 
- if (!handles_setup_) { return; } - - cudnnDestroyTensorDescriptor(this->bottom_desc_); - cudnnDestroyTensorDescriptor(this->top_desc_); - cudnnDestroy(this->handle_); -} - -INSTANTIATE_CLASS(CuDNNReLULayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cudnn_relu_layer.cu b/src/caffe/layers/cudnn_relu_layer.cu deleted file mode 100644 index 21d14857..00000000 --- a/src/caffe/layers/cudnn_relu_layer.cu +++ /dev/null @@ -1,57 +0,0 @@ -#ifdef USE_CUDNN -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void CuDNNReLULayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - // Fallback to standard Caffe for leaky ReLU. - if (ReLULayer::layer_param_.relu_param().negative_slope() != 0) { - return ReLULayer::Forward_gpu(bottom, top); - } - - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - CUDNN_CHECK(cudnnActivationForward(this->handle_, - CUDNN_ACTIVATION_RELU, - cudnn::dataType::one, - this->bottom_desc_, bottom_data, - cudnn::dataType::zero, - this->top_desc_, top_data)); -} - -template -void CuDNNReLULayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (!propagate_down[0]) { - return; - } - - // Fallback to standard Caffe for leaky ReLU. - if (ReLULayer::layer_param_.relu_param().negative_slope() != 0) { - return ReLULayer::Backward_gpu(top, propagate_down, bottom); - } - - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - CUDNN_CHECK(cudnnActivationBackward(this->handle_, - CUDNN_ACTIVATION_RELU, - cudnn::dataType::one, - this->top_desc_, top_data, this->top_desc_, top_diff, - this->bottom_desc_, bottom_data, - cudnn::dataType::zero, - this->bottom_desc_, bottom_diff)); -} - -INSTANTIATE_LAYER_GPU_FUNCS(CuDNNReLULayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cudnn_sigmoid_layer.cpp b/src/caffe/layers/cudnn_sigmoid_layer.cpp deleted file mode 100644 index 32637873..00000000 --- a/src/caffe/layers/cudnn_sigmoid_layer.cpp +++ /dev/null @@ -1,46 +0,0 @@ -#ifdef USE_CUDNN -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void CuDNNSigmoidLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - SigmoidLayer::LayerSetUp(bottom, top); - // initialize cuDNN - CUDNN_CHECK(cudnnCreate(&handle_)); - cudnn::createTensor4dDesc(&bottom_desc_); - cudnn::createTensor4dDesc(&top_desc_); - handles_setup_ = true; -} - -template -void CuDNNSigmoidLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - SigmoidLayer::Reshape(bottom, top); - const int N = bottom[0]->num(); - const int K = bottom[0]->channels(); - const int H = bottom[0]->height(); - const int W = bottom[0]->width(); - cudnn::setTensor4dDesc(&bottom_desc_, N, K, H, W); - cudnn::setTensor4dDesc(&top_desc_, N, K, H, W); -} - -template -CuDNNSigmoidLayer::~CuDNNSigmoidLayer() { - // Check that handles have been setup before destroying. 
-  if (!handles_setup_) { return; }
-
-  cudnnDestroyTensorDescriptor(this->bottom_desc_);
-  cudnnDestroyTensorDescriptor(this->top_desc_);
-  cudnnDestroy(this->handle_);
-}
-
-INSTANTIATE_CLASS(CuDNNSigmoidLayer);
-
-}  // namespace caffe
-#endif
diff --git a/src/caffe/layers/cudnn_sigmoid_layer.cu b/src/caffe/layers/cudnn_sigmoid_layer.cu
deleted file mode 100644
index 7a06cf72..00000000
--- a/src/caffe/layers/cudnn_sigmoid_layer.cu
+++ /dev/null
@@ -1,47 +0,0 @@
-#ifdef USE_CUDNN
-#include <algorithm>
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void CuDNNSigmoidLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  CUDNN_CHECK(cudnnActivationForward(this->handle_,
-        CUDNN_ACTIVATION_SIGMOID,
-        cudnn::dataType<Dtype>::one,
-        this->bottom_desc_, bottom_data,
-        cudnn::dataType<Dtype>::zero,
-        this->top_desc_, top_data));
-}
-
-template <typename Dtype>
-void CuDNNSigmoidLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  if (!propagate_down[0]) {
-    return;
-  }
-
-  const Dtype* top_data = top[0]->gpu_data();
-  const Dtype* top_diff = top[0]->gpu_diff();
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-  CUDNN_CHECK(cudnnActivationBackward(this->handle_,
-        CUDNN_ACTIVATION_SIGMOID,
-        cudnn::dataType<Dtype>::one,
-        this->top_desc_, top_data, this->top_desc_, top_diff,
-        this->bottom_desc_, bottom_data,
-        cudnn::dataType<Dtype>::zero,
-        this->bottom_desc_, bottom_diff));
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(CuDNNSigmoidLayer);
-
-}  // namespace caffe
-#endif
diff --git a/src/caffe/layers/cudnn_softmax_layer.cpp b/src/caffe/layers/cudnn_softmax_layer.cpp
deleted file mode 100644
index 77a3225a..00000000
--- a/src/caffe/layers/cudnn_softmax_layer.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-#ifdef USE_CUDNN
-#include <algorithm>
-#include <cfloat>
-#include <vector>
-
-#include "thrust/device_vector.h"
-
-#include "caffe/layer.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void CuDNNSoftmaxLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  SoftmaxLayer<Dtype>::LayerSetUp(bottom, top);
-  // Initialize CUDNN.
-  CUDNN_CHECK(cudnnCreate(&handle_));
-  cudnn::createTensor4dDesc<Dtype>(&bottom_desc_);
-  cudnn::createTensor4dDesc<Dtype>(&top_desc_);
-  handles_setup_ = true;
-}
-
-template <typename Dtype>
-void CuDNNSoftmaxLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  SoftmaxLayer<Dtype>::Reshape(bottom, top);
-  int N = this->outer_num_;
-  int K = bottom[0]->shape(this->softmax_axis_);
-  int H = this->inner_num_;
-  int W = 1;
-  cudnn::setTensor4dDesc<Dtype>(&bottom_desc_, N, K, H, W);
-  cudnn::setTensor4dDesc<Dtype>(&top_desc_, N, K, H, W);
-}
-
-template <typename Dtype>
-CuDNNSoftmaxLayer<Dtype>::~CuDNNSoftmaxLayer() {
-  // Check that handles have been setup before destroying.
- if (!handles_setup_) { return; } - - cudnnDestroyTensorDescriptor(bottom_desc_); - cudnnDestroyTensorDescriptor(top_desc_); - cudnnDestroy(handle_); -} - -INSTANTIATE_CLASS(CuDNNSoftmaxLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cudnn_softmax_layer.cu b/src/caffe/layers/cudnn_softmax_layer.cu deleted file mode 100644 index a9e2fcef..00000000 --- a/src/caffe/layers/cudnn_softmax_layer.cu +++ /dev/null @@ -1,48 +0,0 @@ -#ifdef USE_CUDNN -#include -#include -#include - -#include "thrust/device_vector.h" - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void CuDNNSoftmaxLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - CUDNN_CHECK(cudnnSoftmaxForward(handle_, CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - cudnn::dataType::one, - bottom_desc_, bottom_data, - cudnn::dataType::zero, - top_desc_, top_data)); -} - -template -void CuDNNSoftmaxLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - - CUDNN_CHECK(cudnnSoftmaxBackward(handle_, CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - cudnn::dataType::one, - top_desc_, top_data, top_desc_, top_diff, - cudnn::dataType::zero, - bottom_desc_, bottom_diff)); - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(CuDNNSoftmaxLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cudnn_tanh_layer.cpp b/src/caffe/layers/cudnn_tanh_layer.cpp deleted file mode 100644 index 376faad3..00000000 --- a/src/caffe/layers/cudnn_tanh_layer.cpp +++ /dev/null @@ -1,46 +0,0 @@ -#ifdef USE_CUDNN -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void CuDNNTanHLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - TanHLayer::LayerSetUp(bottom, top); - // initialize cuDNN - CUDNN_CHECK(cudnnCreate(&handle_)); - cudnn::createTensor4dDesc(&bottom_desc_); - cudnn::createTensor4dDesc(&top_desc_); - handles_setup_ = true; -} - -template -void CuDNNTanHLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - TanHLayer::Reshape(bottom, top); - const int N = bottom[0]->num(); - const int K = bottom[0]->channels(); - const int H = bottom[0]->height(); - const int W = bottom[0]->width(); - cudnn::setTensor4dDesc(&bottom_desc_, N, K, H, W); - cudnn::setTensor4dDesc(&top_desc_, N, K, H, W); -} - -template -CuDNNTanHLayer::~CuDNNTanHLayer() { - // Check that handles have been setup before destroying. 
- if (!handles_setup_) { return; } - - cudnnDestroyTensorDescriptor(this->bottom_desc_); - cudnnDestroyTensorDescriptor(this->top_desc_); - cudnnDestroy(this->handle_); -} - -INSTANTIATE_CLASS(CuDNNTanHLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cudnn_tanh_layer.cu b/src/caffe/layers/cudnn_tanh_layer.cu deleted file mode 100644 index d287f6fe..00000000 --- a/src/caffe/layers/cudnn_tanh_layer.cu +++ /dev/null @@ -1,48 +0,0 @@ -#ifdef USE_CUDNN -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void CuDNNTanHLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - CUDNN_CHECK(cudnnActivationForward(this->handle_, - CUDNN_ACTIVATION_TANH, - cudnn::dataType::one, - this->bottom_desc_, bottom_data, - cudnn::dataType::zero, - this->top_desc_, top_data)); -} - -template -void CuDNNTanHLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (!propagate_down[0]) { - return; - } - - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - - CUDNN_CHECK(cudnnActivationBackward(this->handle_, - CUDNN_ACTIVATION_TANH, - cudnn::dataType::one, - this->top_desc_, top_data, this->top_desc_, top_diff, - this->bottom_desc_, bottom_data, - cudnn::dataType::zero, - this->bottom_desc_, bottom_diff)); -} - -INSTANTIATE_LAYER_GPU_FUNCS(CuDNNTanHLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/data_layer.cpp b/src/caffe/layers/data_layer.cpp index 161a75e0..fdae75a0 100644 --- a/src/caffe/layers/data_layer.cpp +++ b/src/caffe/layers/data_layer.cpp @@ -23,7 +23,7 @@ DataLayer::~DataLayer() { template void DataLayer::DataLayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { // Initialize DB db_.reset(db::GetDB(this->layer_param_.data_param().backend())); db_->Open(this->layer_param_.data_param().source(), db::READ); @@ -31,8 +31,8 @@ void DataLayer::DataLayerSetUp(const vector*>& bottom, // Check if we should randomly skip a few data points if (this->layer_param_.data_param().rand_skip()) { - unsigned int skip = caffe_rng_rand() % - this->layer_param_.data_param().rand_skip(); + unsigned int skip = caffe_rng_rand() + % this->layer_param_.data_param().rand_skip(); LOG(INFO) << "Skipping first " << skip << " data points."; while (skip-- > 0) { cursor_->Next(); @@ -48,6 +48,7 @@ void DataLayer::DataLayerSetUp(const vector*>& bottom, top_shape[0] = this->layer_param_.data_param().batch_size(); this->prefetch_data_.Reshape(top_shape); top[0]->ReshapeLike(this->prefetch_data_); + this->prefetch_data_.set_data_layer(); LOG(INFO) << "output data size: " << top[0]->num() << "," << top[0]->channels() << "," << top[0]->height() << "," @@ -57,6 +58,7 @@ void DataLayer::DataLayerSetUp(const vector*>& bottom, vector label_shape(1, this->layer_param_.data_param().batch_size()); top[1]->Reshape(label_shape); this->prefetch_label_.Reshape(label_shape); + this->prefetch_label_.set_data_layer(); } } @@ -120,7 +122,7 @@ void DataLayer::InternalThreadEntry() { DLOG(INFO) << "Transform time: " << trans_time / 1000 << " ms."; } -INSTANTIATE_CLASS(DataLayer); -REGISTER_LAYER_CLASS(Data); +INSTANTIATE_CLASS (DataLayer); +REGISTER_LAYER_CLASS (Data); } // namespace caffe diff --git 
a/src/caffe/layers/deconv_layer.cpp b/src/caffe/layers/deconv_layer.cpp index a4612963..ddf906b7 100644 --- a/src/caffe/layers/deconv_layer.cpp +++ b/src/caffe/layers/deconv_layer.cpp @@ -18,7 +18,7 @@ void DeconvolutionLayer::compute_output_shape() { template void DeconvolutionLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const Dtype* weight = this->blobs_[0]->cpu_data(); for (int i = 0; i < bottom.size(); ++i) { const Dtype* bottom_data = bottom[i]->cpu_data(); @@ -36,7 +36,7 @@ void DeconvolutionLayer::Forward_cpu(const vector*>& bottom, template void DeconvolutionLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { const Dtype* weight = this->blobs_[0]->cpu_data(); Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff(); for (int i = 0; i < top.size(); ++i) { @@ -69,11 +69,69 @@ void DeconvolutionLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY +#ifndef CPU_ONLY +// begin: code modified for OpenCL port + +template +void DeconvolutionLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + const Dtype* weight = this->blobs_[0]->gpu_data(); + for (int i = 0; i < bottom.size(); ++i) { + const Dtype* bottom_data = bottom[i]->gpu_data(); + Dtype* top_data = top[i]->mutable_gpu_data(); + for (int n = 0; n < this->num_; ++n) { + this->bottom_offset_ = top[i]->offset(n); + this->top_offset_ = bottom[i]->offset(n); + this->backward_gpu_gemm(bottom_data, weight, top_data); + if (this->bias_term_) { + this->top_offset_ = top[i]->offset(n); + const Dtype* bias = this->blobs_[1]->gpu_data(); + this->forward_gpu_bias(top_data, bias); + } + } + } +} + +template +void DeconvolutionLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + const Dtype* weight = this->blobs_[0]->gpu_data(); + Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); + for (int i = 0; i < top.size(); ++i) { + const Dtype* top_diff = top[i]->gpu_diff(); + const Dtype* bottom_data = bottom[i]->gpu_data(); + Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); + // Bias gradient, if necessary. + if (this->bias_term_ && this->param_propagate_down_[1]) { + Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); + for (int n = 0; n < this->num_; ++n) { + this->top_offset_ = top[i]->offset(n); + this->backward_gpu_bias(bias_diff, top_diff); + } + } + if (this->param_propagate_down_[0] || propagate_down[i]) { + for (int n = 0; n < this->num_; ++n) { + this->top_offset_ = bottom[i]->offset(n); + this->bottom_offset_ = top[i]->offset(n); + // gradient w.r.t. weight. Note that we will accumulate diffs. + if (this->param_propagate_down_[0]) { + this->weight_gpu_gemm(top_diff, bottom_data, weight_diff); + } + // gradient w.r.t. bottom data, if necessary. 
+ if (propagate_down[i]) { + this->forward_gpu_gemm(top_diff, weight, bottom_diff); + } + } + } + } +} +// end: code modified for OpenCL port + +#else STUB_GPU(DeconvolutionLayer); #endif -INSTANTIATE_CLASS(DeconvolutionLayer); -REGISTER_LAYER_CLASS(Deconvolution); +INSTANTIATE_CLASS (DeconvolutionLayer); +REGISTER_LAYER_CLASS (Deconvolution); } // namespace caffe diff --git a/src/caffe/layers/deconv_layer.cu b/src/caffe/layers/deconv_layer.cu deleted file mode 100644 index 39bc4de8..00000000 --- a/src/caffe/layers/deconv_layer.cu +++ /dev/null @@ -1,64 +0,0 @@ -#include - -#include "caffe/filler.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/im2col.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void DeconvolutionLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* weight = this->blobs_[0]->gpu_data(); - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* top_data = top[i]->mutable_gpu_data(); - for (int n = 0; n < this->num_; ++n) { - this->backward_gpu_gemm(bottom_data + bottom[i]->offset(n), weight, - top_data + top[i]->offset(n)); - if (this->bias_term_) { - const Dtype* bias = this->blobs_[1]->gpu_data(); - this->forward_gpu_bias(top_data + top[i]->offset(n), bias); - } - } - } -} - -template -void DeconvolutionLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* weight = this->blobs_[0]->gpu_data(); - Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); - for (int i = 0; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->gpu_diff(); - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); - // Bias gradient, if necessary. - if (this->bias_term_ && this->param_propagate_down_[1]) { - Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); - for (int n = 0; n < this->num_; ++n) { - this->backward_gpu_bias(bias_diff, top_diff + top[i]->offset(n)); - } - } - if (this->param_propagate_down_[0] || propagate_down[i]) { - for (int n = 0; n < this->num_; ++n) { - // gradient w.r.t. weight. Note that we will accumulate diffs. - if (this->param_propagate_down_[0]) { - this->weight_gpu_gemm(top_diff + top[i]->offset(n), - bottom_data + bottom[i]->offset(n), weight_diff); - } - // gradient w.r.t. bottom data, if necessary. - if (propagate_down[i]) { - this->forward_gpu_gemm(top_diff + top[i]->offset(n), weight, - bottom_diff + bottom[i]->offset(n)); - } - } - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(DeconvolutionLayer); - -} // namespace caffe diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp index ec1256fd..21699414 100644 --- a/src/caffe/layers/dropout_layer.cpp +++ b/src/caffe/layers/dropout_layer.cpp @@ -12,19 +12,19 @@ namespace caffe { template void DropoutLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - NeuronLayer::LayerSetUp(bottom, top); - threshold_ = this->layer_param_.dropout_param().dropout_ratio(); - DCHECK(threshold_ > 0.); - DCHECK(threshold_ < 1.); - scale_ = 1. / (1. - threshold_); - uint_thres_ = static_cast(UINT_MAX * threshold_); + const vector*>& top) { + NeuronLayer < Dtype > ::LayerSetUp(bottom, top); + threshold_ = this->layer_param_.dropout_param().dropout_ratio(); + DCHECK(threshold_ > 0.); + DCHECK(threshold_ < 1.); + scale_ = 1. / (1. 
- threshold_); + uint_thres_ = static_cast(UINT_MAX * threshold_); } template void DropoutLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - NeuronLayer::Reshape(bottom, top); + const vector*>& top) { + NeuronLayer < Dtype > ::Reshape(bottom, top); // Set up the cache for random number generation rand_vec_.Reshape(bottom[0]->num(), bottom[0]->channels(), bottom[0]->height(), bottom[0]->width()); @@ -50,8 +50,7 @@ void DropoutLayer::Forward_cpu(const vector*>& bottom, template void DropoutLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { if (propagate_down[0]) { const Dtype* top_diff = top[0]->cpu_diff(); Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); @@ -67,12 +66,52 @@ void DropoutLayer::Backward_cpu(const vector*>& top, } } +#ifndef CPU_ONLY +// begin: code modified for OpenCL port +template +void DropoutLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + const int count = bottom[0]->count(); + if (this->phase_ == TRAIN) { + unsigned int* mask = + static_cast(rand_vec_.mutable_gpu_data()); + caffe_gpu_rng_uniform(count, mask); + // set thresholds + // NOLINT_NEXT_LINE(whitespace/operators) + DropoutForward(count, bottom_data, mask, uint_thres_, scale_, top_data); + } else { + if(bottom_data != top_data) + caffe_gpu_copy(count, bottom_data, top_data); + } +} + +template +void DropoutLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[0]) { + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + if (this->phase_ == TRAIN) { + const unsigned int* mask = + static_cast(rand_vec_.gpu_data()); + const int count = bottom[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + DropoutBackward(count, top_diff, mask, uint_thres_, scale_, bottom_diff); + } else { + if(bottom_diff != top_diff) + caffe_gpu_copy(top[0]->count(), top_diff, bottom_diff); + } + } +} +// end: code modified for OpenCL port -#ifdef CPU_ONLY +#else STUB_GPU(DropoutLayer); #endif -INSTANTIATE_CLASS(DropoutLayer); -REGISTER_LAYER_CLASS(Dropout); +INSTANTIATE_CLASS (DropoutLayer); +REGISTER_LAYER_CLASS (Dropout); } // namespace caffe diff --git a/src/caffe/layers/dropout_layer.cu b/src/caffe/layers/dropout_layer.cu deleted file mode 100644 index f9ea04f4..00000000 --- a/src/caffe/layers/dropout_layer.cu +++ /dev/null @@ -1,77 +0,0 @@ -#include -#include -#include - -#include "caffe/common.hpp" -#include "caffe/layer.hpp" -#include "caffe/syncedmem.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - - -template -__global__ void DropoutForward(const int n, const Dtype* in, - const unsigned int* mask, const unsigned int threshold, const float scale, - Dtype* out) { - CUDA_KERNEL_LOOP(index, n) { - out[index] = in[index] * (mask[index] > threshold) * scale; - } -} - -template -void DropoutLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - if (this->phase_ == TRAIN) { - unsigned int* mask = - static_cast(rand_vec_.mutable_gpu_data()); - caffe_gpu_rng_uniform(count, mask); - // set thresholds - // NOLINT_NEXT_LINE(whitespace/operators) - DropoutForward<<>>( - count, 
bottom_data, mask, uint_thres_, scale_, top_data); - CUDA_POST_KERNEL_CHECK; - } else { - caffe_copy(count, bottom_data, top_data); - } -} - -template -__global__ void DropoutBackward(const int n, const Dtype* in_diff, - const unsigned int* mask, const unsigned int threshold, const float scale, - Dtype* out_diff) { - CUDA_KERNEL_LOOP(index, n) { - out_diff[index] = in_diff[index] * scale * (mask[index] > threshold); - } -} - -template -void DropoutLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - if (this->phase_ == TRAIN) { - const unsigned int* mask = - static_cast(rand_vec_.gpu_data()); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - DropoutBackward<<>>( - count, top_diff, mask, uint_thres_, scale_, bottom_diff); - CUDA_POST_KERNEL_CHECK; - } else { - caffe_copy(top[0]->count(), top_diff, bottom_diff); - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(DropoutLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/dummy_data_layer.cpp b/src/caffe/layers/dummy_data_layer.cpp index 6b0d6174..f13f3be1 100644 --- a/src/caffe/layers/dummy_data_layer.cpp +++ b/src/caffe/layers/dummy_data_layer.cpp @@ -8,37 +8,38 @@ namespace caffe { template void DummyDataLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const int num_top = top.size(); const DummyDataParameter& param = this->layer_param_.dummy_data_param(); const int num_data_filler = param.data_filler_size(); - CHECK(num_data_filler == 0 || num_data_filler == 1 || - num_data_filler == num_top) + CHECK( + num_data_filler == 0 || num_data_filler == 1 + || num_data_filler == num_top) << "Number of data fillers must be 0, 1 or equal to the number of tops: " << num_top << "; you specified " << num_data_filler << " data fillers."; - const bool legacy_dims = param.num_size() || param.channels_size() || - param.height_size() || param.width_size(); + const bool legacy_dims = param.num_size() || param.channels_size() + || param.height_size() || param.width_size(); if (legacy_dims) { CHECK_EQ(0, param.shape_size()) << "Both shape and legacy fields were specified"; // Using deprecated 4D output dim specifiers. 
CHECK(param.num_size() == 1 || param.num_size() == num_top) - << "Must specify 'num' once, or once per top blob " - << "(" << num_top << "); specified " << param.num_size() << "."; + << "Must specify 'num' once, or once per top blob " << "(" << num_top + << "); specified " << param.num_size() << "."; CHECK(param.channels_size() == 1 || param.channels_size() == num_top) - << "Must specify 'channels' once, or once per top blob " - << "(" << num_top << "); specified " << param.channels_size() << "."; + << "Must specify 'channels' once, or once per top blob " << "(" + << num_top << "); specified " << param.channels_size() << "."; CHECK(param.height_size() == 1 || param.height_size() == num_top) - << "Must specify 'height' once, or once per top blob " - << "(" << num_top << "); specified " << param.height_size() << "."; + << "Must specify 'height' once, or once per top blob " << "(" << num_top + << "); specified " << param.height_size() << "."; CHECK(param.width_size() == 1 || param.width_size() == num_top) - << "Must specify 'width' once, or once per top blob " - << "(" << num_top << "); specified " << param.width_size() << "."; + << "Must specify 'width' once, or once per top blob " << "(" << num_top + << "); specified " << param.width_size() << "."; } else { CHECK(param.shape_size() == 1 || param.shape_size() == num_top) - << "Must specify 'shape' once, or once per top blob " - << "(" << num_top << "); specified " << param.shape_size() << "."; + << "Must specify 'shape' once, or once per top blob " << "(" << num_top + << "); specified " << param.shape_size() << "."; } // refill_[i] tells Forward i whether or not to actually refill top Blob i. // If refill_[i] is false, Forward does nothing for Blob i. We use this to @@ -62,12 +63,12 @@ void DummyDataLayer::LayerSetUp(const vector*>& bottom, refill_.resize(1); refill_[0] = (strcmp(filler_param.type().c_str(), "constant") == 0); fillers_.resize(1); - fillers_[0].reset(GetFiller(filler_param)); + fillers_[0].reset(GetFiller < Dtype > (filler_param)); } else { refill_.resize(num_top); fillers_.resize(num_top); for (int i = 0; i < num_top; ++i) { - fillers_[i].reset(GetFiller(param.data_filler(i))); + fillers_[i].reset(GetFiller < Dtype > (param.data_filler(i))); // Refill on each iteration iff not using a constant filler, // but use the inverse of this rule for the first run. refill_[i] = @@ -100,7 +101,7 @@ void DummyDataLayer::LayerSetUp(const vector*>& bottom, template void DummyDataLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { for (int i = 0; i < top.size(); ++i) { const int filler_id = (fillers_.size() > 1) ? 
i : 0; if (refill_[filler_id]) { @@ -109,7 +110,7 @@ void DummyDataLayer::Forward_cpu(const vector*>& bottom, } } -INSTANTIATE_CLASS(DummyDataLayer); -REGISTER_LAYER_CLASS(DummyData); +INSTANTIATE_CLASS (DummyDataLayer); +REGISTER_LAYER_CLASS (DummyData); } // namespace caffe diff --git a/src/caffe/layers/eltwise_layer.cpp b/src/caffe/layers/eltwise_layer.cpp index a8070073..84cc279c 100644 --- a/src/caffe/layers/eltwise_layer.cpp +++ b/src/caffe/layers/eltwise_layer.cpp @@ -9,17 +9,19 @@ namespace caffe { template void EltwiseLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - CHECK(this->layer_param().eltwise_param().coeff_size() == 0 - || this->layer_param().eltwise_param().coeff_size() == bottom.size()) << - "Eltwise Layer takes one coefficient per bottom blob."; - CHECK(!(this->layer_param().eltwise_param().operation() - == EltwiseParameter_EltwiseOp_PROD - && this->layer_param().eltwise_param().coeff_size())) << - "Eltwise layer only takes coefficients for summation."; + const vector*>& top) { + CHECK( + this->layer_param().eltwise_param().coeff_size() == 0 + || this->layer_param().eltwise_param().coeff_size() == bottom.size()) + << "Eltwise Layer takes one coefficient per bottom blob."; + CHECK( + !(this->layer_param().eltwise_param().operation() + == EltwiseParameter_EltwiseOp_PROD + && this->layer_param().eltwise_param().coeff_size())) + << "Eltwise layer only takes coefficients for summation."; op_ = this->layer_param_.eltwise_param().operation(); // Blob-wise coefficients for the elementwise operation. - coeffs_ = vector(bottom.size(), 1); + coeffs_ = vector < Dtype > (bottom.size(), 1); if (this->layer_param().eltwise_param().coeff_size()) { for (int i = 0; i < bottom.size(); ++i) { coeffs_[i] = this->layer_param().eltwise_param().coeff(i); @@ -30,21 +32,21 @@ void EltwiseLayer::LayerSetUp(const vector*>& bottom, template void EltwiseLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { for (int i = 1; i < bottom.size(); ++i) { CHECK(bottom[i]->shape() == bottom[0]->shape()); } top[0]->ReshapeLike(*bottom[0]); // If max operation, we will initialize the vector index part. 
- if (this->layer_param_.eltwise_param().operation() == - EltwiseParameter_EltwiseOp_MAX && top.size() == 1) { + if (this->layer_param_.eltwise_param().operation() + == EltwiseParameter_EltwiseOp_MAX && top.size() == 1) { max_idx_.Reshape(bottom[0]->shape()); } } template -void EltwiseLayer::Forward_cpu( - const vector*>& bottom, const vector*>& top) { +void EltwiseLayer::Forward_cpu(const vector*>& bottom, + const vector*>& top) { int* mask = NULL; const Dtype* bottom_data_a = NULL; const Dtype* bottom_data_b = NULL; @@ -113,13 +115,14 @@ void EltwiseLayer::Backward_cpu(const vector*>& top, if (stable_prod_grad_) { bool initialized = false; for (int j = 0; j < bottom.size(); ++j) { - if (i == j) { continue; } + if (i == j) { + continue; + } if (!initialized) { caffe_copy(count, bottom[j]->cpu_data(), bottom_diff); initialized = true; } else { - caffe_mul(count, bottom[j]->cpu_data(), bottom_diff, - bottom_diff); + caffe_mul(count, bottom[j]->cpu_data(), bottom_diff, bottom_diff); } } } else { @@ -151,11 +154,100 @@ void EltwiseLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY +// begin: code modified for OpenCL port +#ifndef CPU_ONLY +template +void EltwiseLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + int* mask = NULL; + const int count = top[0]->count(); + Dtype* top_data = top[0]->mutable_gpu_data(); + switch (op_) { + case EltwiseParameter_EltwiseOp_PROD: + caffe_gpu_mul(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), + top_data); + for (int i = 2; i < bottom.size(); ++i) { + caffe_gpu_mul(count, top_data, bottom[i]->gpu_data(), top_data); + } + break; + case EltwiseParameter_EltwiseOp_SUM: + caffe_gpu_set(count, Dtype(0.), top_data); + // TODO(shelhamer) does cuBLAS optimize to sum for coeff = 1? + for (int i = 0; i < bottom.size(); ++i) { + caffe_gpu_axpy(count, coeffs_[i], bottom[i]->gpu_data(), top_data); + } + break; + case EltwiseParameter_EltwiseOp_MAX: + mask = max_idx_.mutable_gpu_data(); + // NOLINT_NEXT_LINE(whitespace/operators) + MaxForward(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), 0, top_data, + mask); + for (int i = 2; i < bottom.size(); ++i) { + // NOLINT_NEXT_LINE(whitespace/operators) + MaxForward(count, top_data, bottom[i]->gpu_data(), i - 1, top_data, mask); + } + break; + default: + LOG(FATAL) << "Unknown elementwise operation."; + } +} + +template +void EltwiseLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + const int* mask = NULL; + const int count = top[0]->count(); + const Dtype* top_data = top[0]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); + for (int i = 0; i < bottom.size(); ++i) { + if (propagate_down[i]) { + const Dtype* bottom_data = bottom[i]->gpu_data(); + Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); + switch (op_) { + case EltwiseParameter_EltwiseOp_PROD: + if (stable_prod_grad_) { + bool initialized = false; + for (int j = 0; j < bottom.size(); ++j) { + if (i == j) { + continue; + } + if (!initialized) { + caffe_gpu_copy(count, bottom[j]->gpu_data(), bottom_diff); + initialized = true; + } else { + caffe_gpu_mul(count, bottom[j]->gpu_data(), bottom_diff, + bottom_diff); + } + } + } else { + caffe_gpu_div(count, top_data, bottom_data, bottom_diff); + } + caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff); + break; + case EltwiseParameter_EltwiseOp_SUM: + if (coeffs_[i] == Dtype(1.)) { + caffe_gpu_copy(count, top_diff, bottom_diff); + } else { + caffe_gpu_scale(count, coeffs_[i], top_diff, bottom_diff); + } + break; + case 
EltwiseParameter_EltwiseOp_MAX: + mask = max_idx_.gpu_data(); + MaxBackward(count, top_diff, i, mask, bottom_diff); + break; + default: + LOG(FATAL) << "Unknown elementwise operation."; + } + } + } +} +// end: code modified for OpenCL port + +#else STUB_GPU(EltwiseLayer); #endif -INSTANTIATE_CLASS(EltwiseLayer); -REGISTER_LAYER_CLASS(Eltwise); +INSTANTIATE_CLASS (EltwiseLayer); +REGISTER_LAYER_CLASS (Eltwise); } // namespace caffe diff --git a/src/caffe/layers/eltwise_layer.cu b/src/caffe/layers/eltwise_layer.cu deleted file mode 100644 index 2247870d..00000000 --- a/src/caffe/layers/eltwise_layer.cu +++ /dev/null @@ -1,135 +0,0 @@ -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -__global__ void MaxForward(const int nthreads, const Dtype* bottom_data_a, - const Dtype* bottom_data_b, const int blob_idx, Dtype* top_data, - int* mask) { - CUDA_KERNEL_LOOP(index, nthreads) { - Dtype maxval = -FLT_MAX; - int maxidx = -1; - if (bottom_data_a[index] > bottom_data_b[index]) { - // only update for very first bottom_data blob (blob_idx == 0) - if (blob_idx == 0) { - maxval = bottom_data_a[index]; - top_data[index] = maxval; - maxidx = blob_idx; - mask[index] = maxidx; - } - } else { - maxval = bottom_data_b[index]; - top_data[index] = maxval; - maxidx = blob_idx + 1; - mask[index] = maxidx; - } - } -} - -template -void EltwiseLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - int* mask = NULL; - const int count = top[0]->count(); - Dtype* top_data = top[0]->mutable_gpu_data(); - switch (op_) { - case EltwiseParameter_EltwiseOp_PROD: - caffe_gpu_mul(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), - top_data); - for (int i = 2; i < bottom.size(); ++i) { - caffe_gpu_mul(count, top_data, bottom[i]->gpu_data(), top_data); - } - break; - case EltwiseParameter_EltwiseOp_SUM: - caffe_gpu_set(count, Dtype(0.), top_data); - // TODO(shelhamer) does cuBLAS optimize to sum for coeff = 1? 
- for (int i = 0; i < bottom.size(); ++i) { - caffe_gpu_axpy(count, coeffs_[i], bottom[i]->gpu_data(), top_data); - } - break; - case EltwiseParameter_EltwiseOp_MAX: - mask = max_idx_.mutable_gpu_data(); - // NOLINT_NEXT_LINE(whitespace/operators) - MaxForward <<>>( - count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), 0, top_data, mask); - for (int i = 2; i < bottom.size(); ++i) { - // NOLINT_NEXT_LINE(whitespace/operators) - MaxForward<<>>( - count, top_data, bottom[i]->gpu_data(), i-1, top_data, mask); - } - break; - default: - LOG(FATAL) << "Unknown elementwise operation."; - } -} - -template -__global__ void MaxBackward(const int nthreads, const Dtype* top_diff, - const int blob_idx, const int* mask, Dtype* bottom_diff) { - CUDA_KERNEL_LOOP(index, nthreads) { - Dtype gradient = 0; - if (mask[index] == blob_idx) { - gradient += top_diff[index]; - } - bottom_diff[index] = gradient; - } -} - -template -void EltwiseLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const int* mask = NULL; - const int count = top[0]->count(); - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - for (int i = 0; i < bottom.size(); ++i) { - if (propagate_down[i]) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); - switch (op_) { - case EltwiseParameter_EltwiseOp_PROD: - if (stable_prod_grad_) { - bool initialized = false; - for (int j = 0; j < bottom.size(); ++j) { - if (i == j) { continue; } - if (!initialized) { - caffe_copy(count, bottom[j]->gpu_data(), bottom_diff); - initialized = true; - } else { - caffe_gpu_mul(count, bottom[j]->gpu_data(), bottom_diff, - bottom_diff); - } - } - } else { - caffe_gpu_div(count, top_data, bottom_data, bottom_diff); - } - caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff); - break; - case EltwiseParameter_EltwiseOp_SUM: - if (coeffs_[i] == Dtype(1.)) { - caffe_copy(count, top_diff, bottom_diff); - } else { - caffe_gpu_scale(count, coeffs_[i], top_diff, bottom_diff); - } - break; - case EltwiseParameter_EltwiseOp_MAX: - mask = max_idx_.gpu_data(); - MaxBackward // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - count, top_diff, i, mask, bottom_diff); - break; - default: - LOG(FATAL) << "Unknown elementwise operation."; - } - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(EltwiseLayer); - -} // namespace caffe diff --git a/src/caffe/layers/euclidean_loss_layer.cpp b/src/caffe/layers/euclidean_loss_layer.cpp index 80efa31b..ea78484b 100644 --- a/src/caffe/layers/euclidean_loss_layer.cpp +++ b/src/caffe/layers/euclidean_loss_layer.cpp @@ -8,9 +8,9 @@ namespace caffe { template -void EuclideanLossLayer::Reshape( - const vector*>& bottom, const vector*>& top) { - LossLayer::Reshape(bottom, top); +void EuclideanLossLayer::Reshape(const vector*>& bottom, + const vector*>& top) { + LossLayer < Dtype > ::Reshape(bottom, top); CHECK_EQ(bottom[0]->count(1), bottom[1]->count(1)) << "Inputs must have the same dimension."; diff_.ReshapeLike(*bottom[0]); @@ -20,10 +20,7 @@ template void EuclideanLossLayer::Forward_cpu(const vector*>& bottom, const vector*>& top) { int count = bottom[0]->count(); - caffe_sub( - count, - bottom[0]->cpu_data(), - bottom[1]->cpu_data(), + caffe_sub(count, bottom[0]->cpu_data(), bottom[1]->cpu_data(), diff_.mutable_cpu_data()); Dtype dot = caffe_cpu_dot(count, diff_.cpu_data(), diff_.cpu_data()); Dtype loss = dot / bottom[0]->num() / Dtype(2); @@ -37,8 +34,7 @@ void EuclideanLossLayer::Backward_cpu(const 
vector*>& top, if (propagate_down[i]) { const Dtype sign = (i == 0) ? 1 : -1; const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num(); - caffe_cpu_axpby( - bottom[i]->count(), // count + caffe_cpu_axpby(bottom[i]->count(), // count alpha, // alpha diff_.cpu_data(), // a Dtype(0), // beta @@ -47,11 +43,42 @@ void EuclideanLossLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY +#ifndef CPU_ONLY +// begin: code modified for OpenCL port +template +void EuclideanLossLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + int count = bottom[0]->count(); + caffe_gpu_sub(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), + diff_.mutable_gpu_data()); + Dtype dot; + caffe_gpu_dot(count, diff_.gpu_data(), diff_.gpu_data(), &dot); + Dtype loss = dot / bottom[0]->num() / Dtype(2); + top[0]->mutable_cpu_data()[0] = loss; +} + +template +void EuclideanLossLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + for (int i = 0; i < 2; ++i) { + if (propagate_down[i]) { + const Dtype sign = (i == 0) ? 1 : -1; + const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num(); + caffe_gpu_axpby(bottom[i]->count(), // count + alpha, // alpha + diff_.gpu_data(), // a + Dtype(0), // beta + bottom[i]->mutable_gpu_diff()); // b + } + } +} +// end: code modified for OpenCL port + +#else STUB_GPU(EuclideanLossLayer); #endif -INSTANTIATE_CLASS(EuclideanLossLayer); -REGISTER_LAYER_CLASS(EuclideanLoss); +INSTANTIATE_CLASS (EuclideanLossLayer); +REGISTER_LAYER_CLASS (EuclideanLoss); } // namespace caffe diff --git a/src/caffe/layers/euclidean_loss_layer.cu b/src/caffe/layers/euclidean_loss_layer.cu deleted file mode 100644 index 5b1de3ad..00000000 --- a/src/caffe/layers/euclidean_loss_layer.cu +++ /dev/null @@ -1,44 +0,0 @@ -#include - -#include "caffe/layer.hpp" -#include "caffe/util/io.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void EuclideanLossLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - int count = bottom[0]->count(); - caffe_gpu_sub( - count, - bottom[0]->gpu_data(), - bottom[1]->gpu_data(), - diff_.mutable_gpu_data()); - Dtype dot; - caffe_gpu_dot(count, diff_.gpu_data(), diff_.gpu_data(), &dot); - Dtype loss = dot / bottom[0]->num() / Dtype(2); - top[0]->mutable_cpu_data()[0] = loss; -} - -template -void EuclideanLossLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - for (int i = 0; i < 2; ++i) { - if (propagate_down[i]) { - const Dtype sign = (i == 0) ? 
1 : -1; - const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num(); - caffe_gpu_axpby( - bottom[i]->count(), // count - alpha, // alpha - diff_.gpu_data(), // a - Dtype(0), // beta - bottom[i]->mutable_gpu_diff()); // b - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(EuclideanLossLayer); - -} // namespace caffe diff --git a/src/caffe/layers/exp_layer.cpp b/src/caffe/layers/exp_layer.cpp index c7e7c60c..ad40bb1b 100644 --- a/src/caffe/layers/exp_layer.cpp +++ b/src/caffe/layers/exp_layer.cpp @@ -9,8 +9,8 @@ namespace caffe { template void ExpLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - NeuronLayer::LayerSetUp(bottom, top); + const vector*>& top) { + NeuronLayer < Dtype > ::LayerSetUp(bottom, top); const Dtype base = this->layer_param_.exp_param().base(); if (base != Dtype(-1)) { CHECK_GT(base, 0) << "base must be strictly positive."; @@ -18,10 +18,10 @@ void ExpLayer::LayerSetUp(const vector*>& bottom, // If base == -1, interpret the base as e and set log_base = 1 exactly. // Otherwise, calculate its log explicitly. const Dtype log_base = (base == Dtype(-1)) ? Dtype(1) : log(base); - CHECK(!isnan(log_base)) - << "NaN result: log(base) = log(" << base << ") = " << log_base; - CHECK(!isinf(log_base)) - << "Inf result: log(base) = log(" << base << ") = " << log_base; + CHECK(!isnan(log_base)) << "NaN result: log(base) = log(" << base << ") = " + << log_base; + CHECK(!isinf(log_base)) << "Inf result: log(base) = log(" << base << ") = " + << log_base; const Dtype input_scale = this->layer_param_.exp_param().scale(); const Dtype input_shift = this->layer_param_.exp_param().shift(); inner_scale_ = log_base * input_scale; @@ -48,7 +48,9 @@ void ExpLayer::Forward_cpu(const vector*>& bottom, template void ExpLayer::Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { return; } + if (!propagate_down[0]) { + return; + } const int count = bottom[0]->count(); const Dtype* top_data = top[0]->cpu_data(); const Dtype* top_diff = top[0]->cpu_diff(); @@ -59,11 +61,47 @@ void ExpLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY +// begin: code modified for OpenCL port +#ifndef CPU_ONLY +template +void ExpLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + const int count = bottom[0]->count(); + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + if (inner_scale_ == Dtype(1)) { + caffe_gpu_exp(count, bottom_data, top_data); + } else { + caffe_gpu_scale(count, inner_scale_, bottom_data, top_data); + caffe_gpu_exp(count, top_data, top_data); + } + if (outer_scale_ != Dtype(1)) { + caffe_gpu_scal(count, outer_scale_, top_data); + } +} + +template +void ExpLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + if (!propagate_down[0]) { + return; + } + const int count = bottom[0]->count(); + const Dtype* top_data = top[0]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + caffe_gpu_mul(count, top_data, top_diff, bottom_diff); + if (inner_scale_ != Dtype(1)) { + caffe_gpu_scal(count, inner_scale_, bottom_diff); + } +} +// end: code modified for OpenCL port + +#else STUB_GPU(ExpLayer); #endif -INSTANTIATE_CLASS(ExpLayer); -REGISTER_LAYER_CLASS(Exp); +INSTANTIATE_CLASS (ExpLayer); +REGISTER_LAYER_CLASS (Exp); } // namespace caffe diff --git a/src/caffe/layers/exp_layer.cu b/src/caffe/layers/exp_layer.cu deleted file mode 100644 index 
2d75d8dd..00000000 --- a/src/caffe/layers/exp_layer.cu +++ /dev/null @@ -1,44 +0,0 @@ -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void ExpLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const int count = bottom[0]->count(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - if (inner_scale_ == Dtype(1)) { - caffe_gpu_exp(count, bottom_data, top_data); - } else { - caffe_gpu_scale(count, inner_scale_, bottom_data, top_data); - caffe_gpu_exp(count, top_data, top_data); - } - if (outer_scale_ != Dtype(1)) { - caffe_gpu_scal(count, outer_scale_, top_data); - } -} - -template -void ExpLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { return; } - const int count = bottom[0]->count(); - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - caffe_gpu_mul(count, top_data, top_diff, bottom_diff); - if (inner_scale_ != Dtype(1)) { - caffe_gpu_scal(count, inner_scale_, bottom_diff); - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(ExpLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/filter_layer.cpp b/src/caffe/layers/filter_layer.cpp index be1db32d..884764b4 100644 --- a/src/caffe/layers/filter_layer.cpp +++ b/src/caffe/layers/filter_layer.cpp @@ -9,14 +9,14 @@ namespace caffe { template void FilterLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { CHECK_EQ(top.size(), bottom.size() - 1); first_reshape_ = true; } template void FilterLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { // bottom[0...k-1] are the blobs to filter // bottom[last] is the "selector_blob" int selector_index = bottom.size() - 1; @@ -25,8 +25,8 @@ void FilterLayer::Reshape(const vector*>& bottom, << "Selector blob dimensions must be singletons (1), except the first"; } for (int i = 0; i < bottom.size() - 1; ++i) { - CHECK_EQ(bottom[selector_index]->shape(0), bottom[i]->shape(0)) << - "Each bottom should have the same 0th dimension as the selector blob"; + CHECK_EQ(bottom[selector_index]->shape(0), bottom[i]->shape(0)) + << "Each bottom should have the same 0th dimension as the selector blob"; } const Dtype* bottom_data_selector = bottom[selector_index]->cpu_data(); @@ -61,7 +61,7 @@ void FilterLayer::Reshape(const vector*>& bottom, template void FilterLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { int new_tops_num = indices_to_forward_.size(); // forward all filtered items for all bottoms but the Selector (bottom[last]) for (int t = 0; t < top.size(); ++t) { @@ -79,10 +79,10 @@ void FilterLayer::Forward_cpu(const vector*>& bottom, template void FilterLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { if (propagate_down[bottom.size() - 1]) { LOG(FATAL) << this->type() - << "Layer cannot backpropagate to filter index inputs"; + << "Layer cannot backpropagate to filter index inputs"; } for (int i = 0; i < top.size(); i++) { // bottom[last] is the selector and never needs backpropagation @@ -117,11 +117,73 @@ void FilterLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY +// begin: code modified for OpenCL port +#ifndef CPU_ONLY +template 
+void FilterLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + int new_tops_num = indices_to_forward_.size(); + // forward all filtered items for all bottoms but the Selector (bottom[last]) + for (int t = 0; t < top.size(); ++t) { + const Dtype* bottom_data = bottom[t]->gpu_data(); + Dtype* top_data = top[t]->mutable_gpu_data(); + int dim = bottom[t]->count() / bottom[t]->shape(0); + for (int n = 0; n < new_tops_num; ++n) { + int data_offset_top = n * dim; + int data_offset_bottom = indices_to_forward_[n] * dim; + caffe_copy(dim, bottom_data + data_offset_bottom, + top_data + data_offset_top); + } + } +} + +template +void FilterLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[bottom.size() - 1]) { + LOG(FATAL) << this->type() + << "Layer cannot backpropagate to filter index inputs"; + } + for (int i = 0; i < top.size(); ++i) { + // bottom[last] is the selector and never needs backpropagation + // so we can iterate over top vector because top.size() == bottom.size() -1 + if (propagate_down[i]) { + const int dim = top[i]->count() / top[i]->shape(0); + int next_to_backward_offset = 0; + int batch_offset = 0; + int data_offset_bottom = 0; + int data_offset_top = 0; + for (int n = 0; n < bottom[i]->shape(0); ++n) { + if (next_to_backward_offset >= indices_to_forward_.size()) { + // we already visited all items that were been forwarded, so + // just set to zero remaining ones + data_offset_bottom = n * dim; + caffe_gpu_set(dim, Dtype(0), + bottom[i]->mutable_gpu_diff() + data_offset_bottom); + } else { + batch_offset = indices_to_forward_[next_to_backward_offset]; + data_offset_bottom = n * dim; + if (n != batch_offset) { // this data was not been forwarded + caffe_gpu_set(dim, Dtype(0), + bottom[i]->mutable_gpu_diff() + data_offset_bottom); + } else { // this data was been forwarded + data_offset_top = next_to_backward_offset * dim; + ++next_to_backward_offset; // point to next forwarded item index + caffe_copy(dim, top[i]->mutable_gpu_diff() + data_offset_top, + bottom[i]->mutable_gpu_diff() + data_offset_bottom); + } + } + } + } + } +} + +// end: code modified for OpenCL port +#else STUB_GPU(FilterLayer); #endif -INSTANTIATE_CLASS(FilterLayer); -REGISTER_LAYER_CLASS(Filter); +INSTANTIATE_CLASS (FilterLayer); +REGISTER_LAYER_CLASS (Filter); } // namespace caffe diff --git a/src/caffe/layers/filter_layer.cu b/src/caffe/layers/filter_layer.cu deleted file mode 100644 index cf929eee..00000000 --- a/src/caffe/layers/filter_layer.cu +++ /dev/null @@ -1,70 +0,0 @@ -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void FilterLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - int new_tops_num = indices_to_forward_.size(); - // forward all filtered items for all bottoms but the Selector (bottom[last]) - for (int t = 0; t < top.size(); ++t) { - const Dtype* bottom_data = bottom[t]->gpu_data(); - Dtype* top_data = top[t]->mutable_gpu_data(); - int dim = bottom[t]->count() / bottom[t]->shape(0); - for (int n = 0; n < new_tops_num; ++n) { - int data_offset_top = n * dim; - int data_offset_bottom = indices_to_forward_[n] * dim; - caffe_copy(dim, bottom_data + data_offset_bottom, - top_data + data_offset_top); - } - } -} - -template -void FilterLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (propagate_down[bottom.size() - 1]) { - LOG(FATAL) << 
this->type() - << "Layer cannot backpropagate to filter index inputs"; - } - for (int i = 0; i < top.size(); ++i) { - // bottom[last] is the selector and never needs backpropagation - // so we can iterate over top vector because top.size() == bottom.size() -1 - if (propagate_down[i]) { - const int dim = top[i]->count() / top[i]->shape(0); - int next_to_backward_offset = 0; - int batch_offset = 0; - int data_offset_bottom = 0; - int data_offset_top = 0; - for (int n = 0; n < bottom[i]->shape(0); ++n) { - if (next_to_backward_offset >= indices_to_forward_.size()) { - // we already visited all items that were been forwarded, so - // just set to zero remaining ones - data_offset_bottom = n * dim; - caffe_gpu_set(dim, Dtype(0), - bottom[i]->mutable_gpu_diff() + data_offset_bottom); - } else { - batch_offset = indices_to_forward_[next_to_backward_offset]; - data_offset_bottom = n * dim; - if (n != batch_offset) { // this data was not been forwarded - caffe_gpu_set(dim, Dtype(0), - bottom[i]->mutable_gpu_diff() + data_offset_bottom); - } else { // this data was been forwarded - data_offset_top = next_to_backward_offset * dim; - ++next_to_backward_offset; // point to next forwarded item index - caffe_copy(dim, top[i]->mutable_gpu_diff() + data_offset_top, - bottom[i]->mutable_gpu_diff() + data_offset_bottom); - } - } - } - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(FilterLayer); - -} // namespace caffe diff --git a/src/caffe/layers/flatten_layer.cpp b/src/caffe/layers/flatten_layer.cpp index f7e5c9c2..997f213d 100644 --- a/src/caffe/layers/flatten_layer.cpp +++ b/src/caffe/layers/flatten_layer.cpp @@ -8,7 +8,7 @@ namespace caffe { template void FlattenLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const int start_axis = bottom[0]->CanonicalAxisIndex( this->layer_param_.flatten_param().axis()); const int end_axis = bottom[0]->CanonicalAxisIndex( @@ -28,17 +28,17 @@ void FlattenLayer::Reshape(const vector*>& bottom, template void FlattenLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { top[0]->ShareData(*bottom[0]); } template void FlattenLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { bottom[0]->ShareDiff(*top[0]); } -INSTANTIATE_CLASS(FlattenLayer); -REGISTER_LAYER_CLASS(Flatten); +INSTANTIATE_CLASS (FlattenLayer); +REGISTER_LAYER_CLASS (Flatten); } // namespace caffe diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp index 8a782f7e..c87304b0 100644 --- a/src/caffe/layers/hdf5_data_layer.cpp +++ b/src/caffe/layers/hdf5_data_layer.cpp @@ -1,11 +1,11 @@ /* -TODO: -- load file in a separate thread ("prefetch") -- can be smarter about the memcpy call instead of doing it row-by-row - :: use util functions caffe_copy, and Blob->offset() - :: don't forget to update hdf5_daa_layer.cu accordingly -- add ability to shuffle filenames if flag is set -*/ + TODO: + - load file in a separate thread ("prefetch") + - can be smarter about the memcpy call instead of doing it row-by-row + :: use util functions caffe_copy, and Blob->offset() + :: don't forget to update hdf5_daa_layer.cu accordingly + - add ability to shuffle filenames if flag is set + */ #include // NOLINT(readability/streams) #include #include @@ -21,7 +21,8 @@ namespace caffe { template -HDF5DataLayer::~HDF5DataLayer() { } +HDF5DataLayer::~HDF5DataLayer() { +} // Load data and label from HDF5 filename into the class property 
blobs. template @@ -39,7 +40,7 @@ void HDF5DataLayer::LoadHDF5FileData(const char* filename) { const int MAX_DATA_DIM = INT_MAX; for (int i = 0; i < top_size; ++i) { - hdf_blobs_[i] = shared_ptr >(new Blob()); + hdf_blobs_[i] = shared_ptr < Blob > (new Blob()); hdf5_load_nd_dataset(file_id, this->layer_param_.top(i).c_str(), MIN_DATA_DIM, MAX_DATA_DIM, hdf_blobs_[i].get()); } @@ -63,7 +64,7 @@ void HDF5DataLayer::LoadHDF5FileData(const char* filename) { if (this->layer_param_.hdf5_data_param().shuffle()) { std::random_shuffle(data_permutation_.begin(), data_permutation_.end()); DLOG(INFO) << "Successully loaded " << hdf_blobs_[0]->shape(0) - << " rows (shuffled)"; + << " rows (shuffled)"; } else { DLOG(INFO) << "Successully loaded " << hdf_blobs_[0]->shape(0) << " rows"; } @@ -71,10 +72,10 @@ void HDF5DataLayer::LoadHDF5FileData(const char* filename) { template void HDF5DataLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { // Refuse transformation parameters since HDF5 is totally generic. - CHECK(!this->layer_param_.has_transform_param()) << - this->type() << " does not transform data."; + CHECK(!this->layer_param_.has_transform_param()) << this->type() + << " does not transform data."; // Read the source to parse the filenames. const string& source = this->layer_param_.hdf5_data_param().source(); LOG(INFO) << "Loading list of HDF5 filenames from: " << source; @@ -93,7 +94,7 @@ void HDF5DataLayer::LayerSetUp(const vector*>& bottom, current_file_ = 0; LOG(INFO) << "Number of HDF5 files: " << num_files_; CHECK_GE(num_files_, 1) << "Must have at least 1 HDF5 filename listed in " - << source; + << source; file_permutation_.clear(); file_permutation_.resize(num_files_); @@ -127,7 +128,7 @@ void HDF5DataLayer::LayerSetUp(const vector*>& bottom, template void HDF5DataLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const int batch_size = this->layer_param_.hdf5_data_param().batch_size(); for (int i = 0; i < batch_size; ++i, ++current_row_) { if (current_row_ == hdf_blobs_[0]->shape(0)) { @@ -137,7 +138,7 @@ void HDF5DataLayer::Forward_cpu(const vector*>& bottom, current_file_ = 0; if (this->layer_param_.hdf5_data_param().shuffle()) { std::random_shuffle(file_permutation_.begin(), - file_permutation_.end()); + file_permutation_.end()); } DLOG(INFO) << "Looping around to first file."; } @@ -151,17 +152,58 @@ void HDF5DataLayer::Forward_cpu(const vector*>& bottom, for (int j = 0; j < this->layer_param_.top_size(); ++j) { int data_dim = top[j]->count() / top[j]->shape(0); caffe_copy(data_dim, - &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] - * data_dim], &top[j]->mutable_cpu_data()[i * data_dim]); + &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] * data_dim], + &top[j]->mutable_cpu_data()[i * data_dim]); + } + } +} + +// begin: code modified for OpenCL port +#ifndef CPU_ONLY +template +void HDF5DataLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + const int batch_size = this->layer_param_.hdf5_data_param().batch_size(); + for (int i = 0; i < batch_size; ++i, ++current_row_) { + if (current_row_ == hdf_blobs_[0]->shape(0)) { + if (num_files_ > 1) { + current_file_ += 1; + if (current_file_ == num_files_) { + current_file_ = 0; + if (this->layer_param_.hdf5_data_param().shuffle()) { + std::random_shuffle(file_permutation_.begin(), + file_permutation_.end()); + } + DLOG(INFO) << "Looping around to first file."; + } + LoadHDF5FileData( + 
hdf_filenames_[file_permutation_[current_file_]].c_str()); + } + current_row_ = 0; + if (this->layer_param_.hdf5_data_param().shuffle()) + std::random_shuffle(data_permutation_.begin(), data_permutation_.end()); + } + for (int j = 0; j < this->layer_param_.top_size(); ++j) { + int data_dim = top[j]->count() / top[j]->shape(0); + OCL_CHECK( + clEnqueueWriteBuffer(amdDevice.CommandQueue, + (cl_mem) top[j]->mutable_gpu_data(), CL_TRUE, + i * data_dim * sizeof(Dtype), sizeof(Dtype) * data_dim, + &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] + * data_dim], 0, NULL, NULL)); + //caffe_copy(data_dim, + // &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] + // * data_dim], &top[j]->mutable_cpu_data()[i * data_dim]); } } } +// end: code modified for OpenCL port -#ifdef CPU_ONLY +#else STUB_GPU_FORWARD(HDF5DataLayer, Forward); #endif -INSTANTIATE_CLASS(HDF5DataLayer); -REGISTER_LAYER_CLASS(HDF5Data); +INSTANTIATE_CLASS (HDF5DataLayer); +REGISTER_LAYER_CLASS (HDF5Data); } // namespace caffe diff --git a/src/caffe/layers/hdf5_data_layer.cu b/src/caffe/layers/hdf5_data_layer.cu deleted file mode 100644 index 5e3e4ced..00000000 --- a/src/caffe/layers/hdf5_data_layer.cu +++ /dev/null @@ -1,53 +0,0 @@ -/* -TODO: -- only load parts of the file, in accordance with a prototxt param "max_mem" -*/ - -#include -#include -#include - -#include "hdf5.h" -#include "hdf5_hl.h" - -#include "caffe/data_layers.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/io.hpp" - -namespace caffe { - -template -void HDF5DataLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const int batch_size = this->layer_param_.hdf5_data_param().batch_size(); - for (int i = 0; i < batch_size; ++i, ++current_row_) { - if (current_row_ == hdf_blobs_[0]->shape(0)) { - if (num_files_ > 1) { - current_file_ += 1; - if (current_file_ == num_files_) { - current_file_ = 0; - if (this->layer_param_.hdf5_data_param().shuffle()) { - std::random_shuffle(file_permutation_.begin(), - file_permutation_.end()); - } - DLOG(INFO) << "Looping around to first file."; - } - LoadHDF5FileData( - hdf_filenames_[file_permutation_[current_file_]].c_str()); - } - current_row_ = 0; - if (this->layer_param_.hdf5_data_param().shuffle()) - std::random_shuffle(data_permutation_.begin(), data_permutation_.end()); - } - for (int j = 0; j < this->layer_param_.top_size(); ++j) { - int data_dim = top[j]->count() / top[j]->shape(0); - caffe_copy(data_dim, - &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] - * data_dim], &top[j]->mutable_gpu_data()[i * data_dim]); - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(HDF5DataLayer); - -} // namespace caffe diff --git a/src/caffe/layers/hdf5_output_layer.cpp b/src/caffe/layers/hdf5_output_layer.cpp index f63375c3..0005fb94 100644 --- a/src/caffe/layers/hdf5_output_layer.cpp +++ b/src/caffe/layers/hdf5_output_layer.cpp @@ -16,7 +16,7 @@ void HDF5OutputLayer::LayerSetUp(const vector*>& bottom, const vector*>& top) { file_name_ = this->layer_param_.hdf5_output_param().file_name(); file_id_ = H5Fcreate(file_name_.c_str(), H5F_ACC_TRUNC, H5P_DEFAULT, - H5P_DEFAULT); + H5P_DEFAULT); CHECK_GE(file_id_, 0) << "Failed to open HDF5 file" << file_name_; file_opened_ = true; } @@ -33,8 +33,8 @@ template void HDF5OutputLayer::SaveBlobs() { // TODO: no limit on the number of blobs LOG(INFO) << "Saving HDF5 file " << file_name_; - CHECK_EQ(data_blob_.num(), label_blob_.num()) << - "data blob and label blob must have the same batch size"; + CHECK_EQ(data_blob_.num(), label_blob_.num()) + << "data blob and 
label blob must have the same batch size"; hdf5_save_nd_dataset(file_id_, HDF5_DATA_DATASET_NAME, data_blob_); hdf5_save_nd_dataset(file_id_, HDF5_DATA_LABEL_NAME, label_blob_); LOG(INFO) << "Successfully saved " << data_blob_.num() << " rows"; @@ -42,13 +42,13 @@ void HDF5OutputLayer::SaveBlobs() { template void HDF5OutputLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { CHECK_GE(bottom.size(), 2); CHECK_EQ(bottom[0]->num(), bottom[1]->num()); data_blob_.Reshape(bottom[0]->num(), bottom[0]->channels(), - bottom[0]->height(), bottom[0]->width()); + bottom[0]->height(), bottom[0]->width()); label_blob_.Reshape(bottom[1]->num(), bottom[1]->channels(), - bottom[1]->height(), bottom[1]->width()); + bottom[1]->height(), bottom[1]->width()); const int data_datum_dim = bottom[0]->count() / bottom[0]->num(); const int label_datum_dim = bottom[1]->count() / bottom[1]->num(); @@ -63,15 +63,53 @@ void HDF5OutputLayer::Forward_cpu(const vector*>& bottom, template void HDF5OutputLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { + return; +} + +#ifndef CPU_ONLY +// begin: code modified for OpenCL port +template +void HDF5OutputLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + CHECK_GE(bottom.size(), 2); + CHECK_EQ(bottom[0]->num(), bottom[1]->num()); + data_blob_.Reshape(bottom[0]->num(), bottom[0]->channels(), + bottom[0]->height(), bottom[0]->width()); + label_blob_.Reshape(bottom[1]->num(), bottom[1]->channels(), + bottom[1]->height(), bottom[1]->width()); + const int data_datum_dim = bottom[0]->count() / bottom[0]->num(); + const int label_datum_dim = bottom[1]->count() / bottom[1]->num(); + + for (int i = 0; i < bottom[0]->num(); ++i) { + OCL_CHECK( + clEnqueueReadBuffer(amdDevice.CommandQueue, + (cl_mem) bottom[0]->gpu_data(), CL_TRUE, + i * data_datum_dim * sizeof(Dtype), sizeof(Dtype) * data_datum_dim, + &data_blob_.mutable_cpu_data()[i * data_datum_dim], 0, NULL, NULL)); + OCL_CHECK( + clEnqueueReadBuffer(amdDevice.CommandQueue, + (cl_mem) bottom[1]->gpu_data(), CL_TRUE, + i * label_datum_dim * sizeof(Dtype), + sizeof(Dtype) * label_datum_dim, + &label_blob_.mutable_cpu_data()[i * label_datum_dim], 0, NULL, + NULL)); + } + SaveBlobs(); +} + +template +void HDF5OutputLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { return; } +// end: code modified for OpenCL port -#ifdef CPU_ONLY +#else STUB_GPU(HDF5OutputLayer); #endif -INSTANTIATE_CLASS(HDF5OutputLayer); -REGISTER_LAYER_CLASS(HDF5Output); +INSTANTIATE_CLASS (HDF5OutputLayer); +REGISTER_LAYER_CLASS (HDF5Output); } // namespace caffe diff --git a/src/caffe/layers/hdf5_output_layer.cu b/src/caffe/layers/hdf5_output_layer.cu deleted file mode 100644 index ae497c34..00000000 --- a/src/caffe/layers/hdf5_output_layer.cu +++ /dev/null @@ -1,43 +0,0 @@ -#include - -#include "hdf5.h" -#include "hdf5_hl.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/io.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void HDF5OutputLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - CHECK_GE(bottom.size(), 2); - CHECK_EQ(bottom[0]->num(), bottom[1]->num()); - data_blob_.Reshape(bottom[0]->num(), bottom[0]->channels(), - bottom[0]->height(), bottom[0]->width()); - label_blob_.Reshape(bottom[1]->num(), bottom[1]->channels(), - bottom[1]->height(), 
bottom[1]->width()); - const int data_datum_dim = bottom[0]->count() / bottom[0]->num(); - const int label_datum_dim = bottom[1]->count() / bottom[1]->num(); - - for (int i = 0; i < bottom[0]->num(); ++i) { - caffe_copy(data_datum_dim, &bottom[0]->gpu_data()[i * data_datum_dim], - &data_blob_.mutable_cpu_data()[i * data_datum_dim]); - caffe_copy(label_datum_dim, &bottom[1]->gpu_data()[i * label_datum_dim], - &label_blob_.mutable_cpu_data()[i * label_datum_dim]); - } - SaveBlobs(); -} - -template -void HDF5OutputLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - return; -} - -INSTANTIATE_LAYER_GPU_FUNCS(HDF5OutputLayer); - -} // namespace caffe diff --git a/src/caffe/layers/hinge_loss_layer.cpp b/src/caffe/layers/hinge_loss_layer.cpp index a2fb2a18..b2259859 100644 --- a/src/caffe/layers/hinge_loss_layer.cpp +++ b/src/caffe/layers/hinge_loss_layer.cpp @@ -26,8 +26,8 @@ void HingeLossLayer::Forward_cpu(const vector*>& bottom, } for (int i = 0; i < num; ++i) { for (int j = 0; j < dim; ++j) { - bottom_diff[i * dim + j] = std::max( - Dtype(0), 1 + bottom_diff[i * dim + j]); + bottom_diff[i * dim + j] = std::max(Dtype(0), + 1 + bottom_diff[i * dim + j]); } } Dtype* loss = top[0]->mutable_cpu_data(); @@ -48,7 +48,7 @@ void HingeLossLayer::Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { if (propagate_down[1]) { LOG(FATAL) << this->type() - << " Layer cannot backpropagate to label inputs."; + << " Layer cannot backpropagate to label inputs."; } if (propagate_down[0]) { Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); @@ -76,7 +76,7 @@ void HingeLossLayer::Backward_cpu(const vector*>& top, } } -INSTANTIATE_CLASS(HingeLossLayer); -REGISTER_LAYER_CLASS(HingeLoss); +INSTANTIATE_CLASS (HingeLossLayer); +REGISTER_LAYER_CLASS (HingeLoss); } // namespace caffe diff --git a/src/caffe/layers/im2col_layer.cpp b/src/caffe/layers/im2col_layer.cpp index 1c802714..36245446 100644 --- a/src/caffe/layers/im2col_layer.cpp +++ b/src/caffe/layers/im2col_layer.cpp @@ -9,21 +9,24 @@ namespace caffe { template void Im2colLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { ConvolutionParameter conv_param = this->layer_param_.convolution_param(); - CHECK(!conv_param.has_kernel_size() != - !(conv_param.has_kernel_h() && conv_param.has_kernel_w())) + CHECK( + !conv_param.has_kernel_size() + != !(conv_param.has_kernel_h() && conv_param.has_kernel_w())) << "Filter size is kernel_size OR kernel_h and kernel_w; not both"; - CHECK(conv_param.has_kernel_size() || - (conv_param.has_kernel_h() && conv_param.has_kernel_w())) + CHECK( + conv_param.has_kernel_size() + || (conv_param.has_kernel_h() && conv_param.has_kernel_w())) << "For non-square filters both kernel_h and kernel_w are required."; - CHECK((!conv_param.has_pad() && conv_param.has_pad_h() - && conv_param.has_pad_w()) - || (!conv_param.has_pad_h() && !conv_param.has_pad_w())) + CHECK( + (!conv_param.has_pad() && conv_param.has_pad_h() && conv_param.has_pad_w()) + || (!conv_param.has_pad_h() && !conv_param.has_pad_w())) << "pad is pad OR pad_h and pad_w are required."; - CHECK((!conv_param.has_stride() && conv_param.has_stride_h() - && conv_param.has_stride_w()) - || (!conv_param.has_stride_h() && !conv_param.has_stride_w())) + CHECK( + (!conv_param.has_stride() && conv_param.has_stride_h() + && conv_param.has_stride_w()) + || (!conv_param.has_stride_h() && !conv_param.has_stride_w())) << "Stride is stride OR stride_h and stride_w are 
required."; if (conv_param.has_kernel_size()) { kernel_h_ = kernel_w_ = conv_param.kernel_size(); @@ -49,47 +52,74 @@ void Im2colLayer::LayerSetUp(const vector*>& bottom, template void Im2colLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " << "corresponding to (num, channels, height, width)"; channels_ = bottom[0]->channels(); height_ = bottom[0]->height(); width_ = bottom[0]->width(); - top[0]->Reshape( - bottom[0]->num(), channels_ * kernel_h_ * kernel_w_, + top[0]->Reshape(bottom[0]->num(), channels_ * kernel_h_ * kernel_w_, (height_ + 2 * pad_h_ - kernel_h_) / stride_h_ + 1, (width_ + 2 * pad_w_ - kernel_w_) / stride_w_ + 1); } template void Im2colLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* top_data = top[0]->mutable_cpu_data(); for (int n = 0; n < bottom[0]->num(); ++n) { - im2col_cpu(bottom_data + bottom[0]->offset(n), channels_, height_, - width_, kernel_h_, kernel_w_, pad_h_, pad_w_, - stride_h_, stride_w_, top_data + top[0]->offset(n)); + im2col_cpu(bottom_data + bottom[0]->offset(n), channels_, height_, width_, + kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, + top_data + top[0]->offset(n)); } } template void Im2colLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { const Dtype* top_diff = top[0]->cpu_diff(); Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); for (int n = 0; n < top[0]->num(); ++n) { col2im_cpu(top_diff + top[0]->offset(n), channels_, height_, width_, - kernel_h_, kernel_w_, pad_h_, pad_w_, - stride_h_, stride_w_, bottom_diff + bottom[0]->offset(n)); + kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, + bottom_diff + bottom[0]->offset(n)); } } -#ifdef CPU_ONLY +#ifndef CPU_ONLY +// begin: code modified for OpenCL port +template +void Im2colLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + for (int n = 0; n < bottom[0]->num(); ++n) { + im2col_gpu(bottom_data, bottom[0]->offset(n), channels_, height_, width_, + kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, top_data, + top[0]->offset(n)); + } +} + +template +void Im2colLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + for (int n = 0; n < top[0]->num(); ++n) { + col2im_gpu(top_diff, top[0]->offset(n), channels_, height_, width_, + kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, bottom_diff, + bottom[0]->offset(n)); + } +} +// end: code modified for OpenCL port + +#else STUB_GPU(Im2colLayer); #endif -INSTANTIATE_CLASS(Im2colLayer); -REGISTER_LAYER_CLASS(Im2col); +INSTANTIATE_CLASS (Im2colLayer); +REGISTER_LAYER_CLASS (Im2col); } // namespace caffe diff --git a/src/caffe/layers/im2col_layer.cu b/src/caffe/layers/im2col_layer.cu deleted file mode 100644 index 9c338b14..00000000 --- a/src/caffe/layers/im2col_layer.cu +++ /dev/null @@ -1,37 +0,0 @@ -#include - -#include "caffe/common.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/im2col.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void Im2colLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { 
- const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - for (int n = 0; n < bottom[0]->num(); ++n) { - im2col_gpu(bottom_data + bottom[0]->offset(n), channels_, height_, - width_, kernel_h_, kernel_w_, pad_h_, pad_w_, - stride_h_, stride_w_, top_data + top[0]->offset(n)); - } -} - -template -void Im2colLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - for (int n = 0; n < top[0]->num(); ++n) { - col2im_gpu(top_diff + top[0]->offset(n), channels_, height_, width_, - kernel_h_, kernel_w_, pad_h_, pad_w_, - stride_h_, stride_w_, bottom_diff + bottom[0]->offset(n)); - } -} - - -INSTANTIATE_LAYER_GPU_FUNCS(Im2colLayer); - -} // namespace caffe diff --git a/src/caffe/layers/image_data_layer.cpp b/src/caffe/layers/image_data_layer.cpp index 18c035cb..21957551 100644 --- a/src/caffe/layers/image_data_layer.cpp +++ b/src/caffe/layers/image_data_layer.cpp @@ -22,15 +22,16 @@ ImageDataLayer::~ImageDataLayer() { template void ImageDataLayer::DataLayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const int new_height = this->layer_param_.image_data_param().new_height(); - const int new_width = this->layer_param_.image_data_param().new_width(); - const bool is_color = this->layer_param_.image_data_param().is_color(); + const int new_width = this->layer_param_.image_data_param().new_width(); + const bool is_color = this->layer_param_.image_data_param().is_color(); string root_folder = this->layer_param_.image_data_param().root_folder(); - CHECK((new_height == 0 && new_width == 0) || - (new_height > 0 && new_width > 0)) << "Current implementation requires " - "new_height and new_width to be set at the same time."; + CHECK( + (new_height == 0 && new_width == 0) || (new_height > 0 && new_width > 0)) + << "Current implementation requires " + "new_height and new_width to be set at the same time."; // Read the file with filenames and labels const string& source = this->layer_param_.image_data_param().source(); LOG(INFO) << "Opening file " << source; @@ -53,15 +54,15 @@ void ImageDataLayer::DataLayerSetUp(const vector*>& bottom, lines_id_ = 0; // Check if we would need to randomly skip a few data points if (this->layer_param_.image_data_param().rand_skip()) { - unsigned int skip = caffe_rng_rand() % - this->layer_param_.image_data_param().rand_skip(); + unsigned int skip = caffe_rng_rand() + % this->layer_param_.image_data_param().rand_skip(); LOG(INFO) << "Skipping first " << skip << " data points."; CHECK_GT(lines_.size(), skip) << "Not enough points to skip"; lines_id_ = skip; } // Read an image, and use it to initialize the top blob. cv::Mat cv_img = ReadImageToCVMat(root_folder + lines_[lines_id_].first, - new_height, new_width, is_color); + new_height, new_width, is_color); // Use data_transformer to infer the expected blob shape from a cv_image. 
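// The setup code above reads a single image through ReadImageToCVMat and resizes
// it only when both new_height and new_width are positive, which is exactly what
// the CHECK earlier in this hunk enforces. A rough plain-OpenCV equivalent,
// assuming nothing about the helper beyond what this hunk shows:

#include <opencv2/opencv.hpp>
#include <string>

static cv::Mat read_image_to_cvmat(const std::string& path, int new_height,
                                   int new_width, bool is_color) {
  // Decode as 3-channel BGR or single-channel grayscale depending on is_color.
  cv::Mat img = cv::imread(path, is_color ? cv::IMREAD_COLOR
                                          : cv::IMREAD_GRAYSCALE);
  if (!img.empty() && new_height > 0 && new_width > 0) {
    cv::resize(img, img, cv::Size(new_width, new_height));  // fixed input shape
  }
  return img;
}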
vector top_shape = this->data_transformer_->InferBlobShape(cv_img); this->transformed_data_.Reshape(top_shape); @@ -153,7 +154,7 @@ void ImageDataLayer::InternalThreadEntry() { DLOG(INFO) << "Transform time: " << trans_time / 1000 << " ms."; } -INSTANTIATE_CLASS(ImageDataLayer); -REGISTER_LAYER_CLASS(ImageData); +INSTANTIATE_CLASS (ImageDataLayer); +REGISTER_LAYER_CLASS (ImageData); } // namespace caffe diff --git a/src/caffe/layers/infogain_loss_layer.cpp b/src/caffe/layers/infogain_loss_layer.cpp index a1e0b40d..ffd2ab97 100644 --- a/src/caffe/layers/infogain_loss_layer.cpp +++ b/src/caffe/layers/infogain_loss_layer.cpp @@ -11,24 +11,24 @@ namespace caffe { template -void InfogainLossLayer::LayerSetUp( - const vector*>& bottom, const vector*>& top) { - LossLayer::LayerSetUp(bottom, top); +void InfogainLossLayer::LayerSetUp(const vector*>& bottom, + const vector*>& top) { + LossLayer < Dtype > ::LayerSetUp(bottom, top); if (bottom.size() < 3) { CHECK(this->layer_param_.infogain_loss_param().has_source()) << "Infogain matrix source must be specified."; BlobProto blob_proto; - ReadProtoFromBinaryFile( - this->layer_param_.infogain_loss_param().source(), &blob_proto); + ReadProtoFromBinaryFile(this->layer_param_.infogain_loss_param().source(), + &blob_proto); infogain_.FromProto(blob_proto); } } template -void InfogainLossLayer::Reshape( - const vector*>& bottom, const vector*>& top) { - LossLayer::Reshape(bottom, top); - Blob* infogain = NULL; +void InfogainLossLayer::Reshape(const vector*>& bottom, + const vector*>& top) { + LossLayer < Dtype > ::Reshape(bottom, top); + Blob < Dtype > *infogain = NULL; if (bottom.size() < 3) { infogain = &infogain_; } else { @@ -45,7 +45,6 @@ void InfogainLossLayer::Reshape( CHECK_EQ(infogain->width(), dim); } - template void InfogainLossLayer::Forward_cpu(const vector*>& bottom, const vector*>& top) { @@ -72,15 +71,14 @@ void InfogainLossLayer::Forward_cpu(const vector*>& bottom, template void InfogainLossLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { if (propagate_down[1]) { LOG(FATAL) << this->type() - << " Layer cannot backpropagate to label inputs."; + << " Layer cannot backpropagate to label inputs."; } if (propagate_down.size() > 2 && propagate_down[2]) { LOG(FATAL) << this->type() - << " Layer cannot backpropagate to infogain inputs."; + << " Layer cannot backpropagate to infogain inputs."; } if (propagate_down[0]) { const Dtype* bottom_data = bottom[0]->cpu_data(); @@ -94,7 +92,7 @@ void InfogainLossLayer::Backward_cpu(const vector*>& top, Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); int num = bottom[0]->num(); int dim = bottom[0]->count() / bottom[0]->num(); - const Dtype scale = - top[0]->cpu_diff()[0] / num; + const Dtype scale = -top[0]->cpu_diff()[0] / num; for (int i = 0; i < num; ++i) { const int label = static_cast(bottom_label[i]); for (int j = 0; j < dim; ++j) { @@ -105,6 +103,6 @@ void InfogainLossLayer::Backward_cpu(const vector*>& top, } } -INSTANTIATE_CLASS(InfogainLossLayer); -REGISTER_LAYER_CLASS(InfogainLoss); +INSTANTIATE_CLASS (InfogainLossLayer); +REGISTER_LAYER_CLASS (InfogainLoss); } // namespace caffe diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp index 83c3235e..cfa4246a 100644 --- a/src/caffe/layers/inner_product_layer.cpp +++ b/src/caffe/layers/inner_product_layer.cpp @@ -11,7 +11,7 @@ namespace caffe { template void InnerProductLayer::LayerSetUp(const 
vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const int num_output = this->layer_param_.inner_product_param().num_output(); bias_term_ = this->layer_param_.inner_product_param().bias_term(); N_ = num_output; @@ -36,15 +36,19 @@ void InnerProductLayer::LayerSetUp(const vector*>& bottom, weight_shape[1] = K_; this->blobs_[0].reset(new Blob(weight_shape)); // fill the weights - shared_ptr > weight_filler(GetFiller( - this->layer_param_.inner_product_param().weight_filler())); + shared_ptr < Filler + > weight_filler( + GetFiller < Dtype + > (this->layer_param_.inner_product_param().weight_filler())); weight_filler->Fill(this->blobs_[0].get()); // If necessary, intiialize and fill the bias term if (bias_term_) { vector bias_shape(1, N_); this->blobs_[1].reset(new Blob(bias_shape)); - shared_ptr > bias_filler(GetFiller( - this->layer_param_.inner_product_param().bias_filler())); + shared_ptr < Filler + > bias_filler( + GetFiller < Dtype + > (this->layer_param_.inner_product_param().bias_filler())); bias_filler->Fill(this->blobs_[1].get()); } } // parameter initialization @@ -53,7 +57,7 @@ void InnerProductLayer::LayerSetUp(const vector*>& bottom, template void InnerProductLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { // Figure out the dimensions const int axis = bottom[0]->CanonicalAxisIndex( this->layer_param_.inner_product_param().axis()); @@ -83,47 +87,84 @@ void InnerProductLayer::Forward_cpu(const vector*>& bottom, const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* top_data = top[0]->mutable_cpu_data(); const Dtype* weight = this->blobs_[0]->cpu_data(); - caffe_cpu_gemm(CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype)1., - bottom_data, weight, (Dtype)0., top_data); + caffe_cpu_gemm < Dtype + > (CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype) 1., bottom_data, weight, (Dtype) 0., top_data); if (bias_term_) { - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype)1., - bias_multiplier_.cpu_data(), - this->blobs_[1]->cpu_data(), (Dtype)1., top_data); + caffe_cpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype) 1., bias_multiplier_.cpu_data(), this->blobs_[1]->cpu_data(), (Dtype) 1., top_data); } } template void InnerProductLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { if (this->param_propagate_down_[0]) { const Dtype* top_diff = top[0]->cpu_diff(); const Dtype* bottom_data = bottom[0]->cpu_data(); // Gradient with respect to weight - caffe_cpu_gemm(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1., - top_diff, bottom_data, (Dtype)1., this->blobs_[0]->mutable_cpu_diff()); + caffe_cpu_gemm < Dtype + > (CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype) 1., top_diff, bottom_data, (Dtype) 1., this->blobs_[0]->mutable_cpu_diff()); } if (bias_term_ && this->param_propagate_down_[1]) { const Dtype* top_diff = top[0]->cpu_diff(); // Gradient with respect to bias - caffe_cpu_gemv(CblasTrans, M_, N_, (Dtype)1., top_diff, - bias_multiplier_.cpu_data(), (Dtype)1., - this->blobs_[1]->mutable_cpu_diff()); + caffe_cpu_gemv < Dtype + > (CblasTrans, M_, N_, (Dtype) 1., top_diff, bias_multiplier_.cpu_data(), (Dtype) 1., this->blobs_[1]->mutable_cpu_diff()); } if (propagate_down[0]) { const Dtype* top_diff = top[0]->cpu_diff(); // Gradient with respect to bottom data - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype)1., - top_diff, this->blobs_[0]->cpu_data(), (Dtype)0., - bottom[0]->mutable_cpu_diff()); + 
caffe_cpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype) 1., top_diff, this->blobs_[0]->cpu_data(), (Dtype) 0., bottom[0]->mutable_cpu_diff()); + } +} + +#ifndef CPU_ONLY +// begin: code modified for OpenCL port +template +void InnerProductLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + const Dtype* weight = this->blobs_[0]->gpu_data(); + caffe_gpu_gemm < Dtype + > (CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype) 1., bottom_data, 0, weight, 0, (Dtype) 0., top_data, 0); + if (bias_term_) { + caffe_gpu_gemm < Dtype + > (CblasNoTrans, CblasTrans, M_, N_, 1, (Dtype) 1., bias_multiplier_.gpu_data(), 0, this->blobs_[1]->gpu_data(), 0, (Dtype) 1., top_data, 0); + } +} + +template +void InnerProductLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + if (this->param_propagate_down_[0]) { + const Dtype* top_diff = top[0]->gpu_diff(); + const Dtype* bottom_data = bottom[0]->gpu_data(); + // Gradient with respect to weight + caffe_gpu_gemm < Dtype + > (CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype) 1., top_diff, 0, bottom_data, 0, (Dtype) 1., this->blobs_[0]->mutable_gpu_diff(), 0); + } + if (bias_term_ && this->param_propagate_down_[1]) { + const Dtype* top_diff = top[0]->gpu_diff(); + // Gradient with respect to bias + caffe_gpu_gemv < Dtype + > (CblasTrans, M_, N_, (Dtype) 1., (Dtype*) top_diff, (size_t) 0, N_, reinterpret_cast(bias_multiplier_.gpu_data()), (size_t) 0, (Dtype) 0., 1, this->blobs_[1]->mutable_gpu_diff(), (size_t) 0, 1); + } + if (propagate_down[0]) { + const Dtype* top_diff = top[0]->gpu_diff(); + // Gradient with respect to bottom data + caffe_gpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype) 1., top_diff, 0, this->blobs_[0]->gpu_data(), 0, (Dtype) 0., bottom[0]->mutable_gpu_diff(), 0); } } +// end: code modified for OpenCL port -#ifdef CPU_ONLY +#else STUB_GPU(InnerProductLayer); #endif -INSTANTIATE_CLASS(InnerProductLayer); -REGISTER_LAYER_CLASS(InnerProduct); +INSTANTIATE_CLASS (InnerProductLayer); +REGISTER_LAYER_CLASS (InnerProduct); } // namespace caffe diff --git a/src/caffe/layers/inner_product_layer.cu b/src/caffe/layers/inner_product_layer.cu deleted file mode 100644 index dd90cac1..00000000 --- a/src/caffe/layers/inner_product_layer.cu +++ /dev/null @@ -1,56 +0,0 @@ -#include - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void InnerProductLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const Dtype* weight = this->blobs_[0]->gpu_data(); - caffe_gpu_gemm(CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype)1., - bottom_data, weight, (Dtype)0., top_data); - if (bias_term_) { - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype)1., - bias_multiplier_.gpu_data(), - this->blobs_[1]->gpu_data(), (Dtype)1., top_data); - } -} - -template -void InnerProductLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (this->param_propagate_down_[0]) { - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - // Gradient with respect to weight - caffe_gpu_gemm(CblasTrans, CblasNoTrans, N_, K_, 
M_, (Dtype)1., - top_diff, bottom_data, (Dtype)1., this->blobs_[0]->mutable_gpu_diff()); - } - if (bias_term_ && this->param_propagate_down_[1]) { - const Dtype* top_diff = top[0]->gpu_diff(); - // Gradient with respect to bias - caffe_gpu_gemv(CblasTrans, M_, N_, (Dtype)1., top_diff, - bias_multiplier_.gpu_data(), (Dtype)1., - this->blobs_[1]->mutable_gpu_diff()); - } - if (propagate_down[0]) { - const Dtype* top_diff = top[0]->gpu_diff(); - // Gradient with respect to bottom data - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype)1., - top_diff, this->blobs_[0]->gpu_data(), (Dtype)0., - bottom[0]->mutable_gpu_diff()); - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(InnerProductLayer); - -} // namespace caffe diff --git a/src/caffe/layers/log_layer.cpp b/src/caffe/layers/log_layer.cpp index 55a227f6..a01c9c18 100644 --- a/src/caffe/layers/log_layer.cpp +++ b/src/caffe/layers/log_layer.cpp @@ -9,8 +9,8 @@ namespace caffe { template void LogLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - NeuronLayer::LayerSetUp(bottom, top); + const vector*>& top) { + NeuronLayer < Dtype > ::LayerSetUp(bottom, top); const Dtype base = this->layer_param_.log_param().base(); if (base != Dtype(-1)) { CHECK_GT(base, 0) << "base must be strictly positive."; @@ -18,15 +18,15 @@ void LogLayer::LayerSetUp(const vector*>& bottom, // If base == -1, interpret the base as e and set log_base = 1 exactly. // Otherwise, calculate its log explicitly. const Dtype log_base = (base == Dtype(-1)) ? Dtype(1) : log(base); - CHECK(!isnan(log_base)) - << "NaN result: log(base) = log(" << base << ") = " << log_base; - CHECK(!isinf(log_base)) - << "Inf result: log(base) = log(" << base << ") = " << log_base; + CHECK(!isnan(log_base)) << "NaN result: log(base) = log(" << base << ") = " + << log_base; + CHECK(!isinf(log_base)) << "Inf result: log(base) = log(" << base << ") = " + << log_base; base_scale_ = Dtype(1) / log_base; - CHECK(!isnan(base_scale_)) - << "NaN result: 1/log(base) = 1/log(" << base << ") = " << base_scale_; - CHECK(!isinf(base_scale_)) - << "Inf result: 1/log(base) = 1/log(" << base << ") = " << base_scale_; + CHECK(!isnan(base_scale_)) << "NaN result: 1/log(base) = 1/log(" << base + << ") = " << base_scale_; + CHECK(!isinf(base_scale_)) << "Inf result: 1/log(base) = 1/log(" << base + << ") = " << base_scale_; input_scale_ = this->layer_param_.log_param().scale(); input_shift_ = this->layer_param_.log_param().shift(); backward_num_scale_ = input_scale_ / log_base; @@ -58,7 +58,9 @@ void LogLayer::Forward_cpu(const vector*>& bottom, template void LogLayer::Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { return; } + if (!propagate_down[0]) { + return; + } const int count = bottom[0]->count(); const Dtype* bottom_data = bottom[0]->cpu_data(); const Dtype* top_diff = top[0]->cpu_diff(); @@ -77,11 +79,61 @@ void LogLayer::Backward_cpu(const vector*>& top, caffe_mul(count, top_diff, bottom_diff, bottom_diff); } -#ifdef CPU_ONLY +#ifndef CPU_ONLY +// begin: code modified for OpenCL port +template +void LogLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + const int count = bottom[0]->count(); + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + if (input_scale_ == Dtype(1) && input_shift_ == Dtype(0)) { + caffe_gpu_log(count, bottom_data, top_data); + } else { + caffe_gpu_copy(count, bottom_data, top_data); + if (input_scale_ != Dtype(1)) { + 
caffe_gpu_scal(count, input_scale_, top_data); + } + if (input_shift_ != Dtype(0)) { + caffe_gpu_add_scalar(count, input_shift_, top_data); + } + caffe_gpu_log(count, top_data, top_data); + } + if (base_scale_ != Dtype(1)) { + caffe_gpu_scal(count, base_scale_, top_data); + } +} + +template +void LogLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + if (!propagate_down[0]) { + return; + } + const int count = bottom[0]->count(); + const Dtype* bottom_data = bottom[0]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + caffe_gpu_copy(count, bottom_data, bottom_diff); + if (input_scale_ != Dtype(1)) { + caffe_gpu_scal(count, input_scale_, bottom_diff); + } + if (input_shift_ != Dtype(0)) { + caffe_gpu_add_scalar(count, input_shift_, bottom_diff); + } + caffe_gpu_powx(count, bottom_diff, Dtype(-1), bottom_diff); + if (backward_num_scale_ != Dtype(1)) { + caffe_gpu_scal(count, backward_num_scale_, bottom_diff); + } + caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff); +} +// end: code modified for OpenCL port + +#else STUB_GPU(LogLayer); #endif -INSTANTIATE_CLASS(LogLayer); -REGISTER_LAYER_CLASS(Log); +INSTANTIATE_CLASS (LogLayer); +REGISTER_LAYER_CLASS (Log); } // namespace caffe diff --git a/src/caffe/layers/log_layer.cu b/src/caffe/layers/log_layer.cu deleted file mode 100644 index 847c86cd..00000000 --- a/src/caffe/layers/log_layer.cu +++ /dev/null @@ -1,57 +0,0 @@ -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/neuron_layers.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -void LogLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const int count = bottom[0]->count(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - if (input_scale_ == Dtype(1) && input_shift_ == Dtype(0)) { - caffe_gpu_log(count, bottom_data, top_data); - } else { - caffe_copy(count, bottom_data, top_data); - if (input_scale_ != Dtype(1)) { - caffe_gpu_scal(count, input_scale_, top_data); - } - if (input_shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, input_shift_, top_data); - } - caffe_gpu_log(count, top_data, top_data); - } - if (base_scale_ != Dtype(1)) { - caffe_gpu_scal(count, base_scale_, top_data); - } -} - -template -void LogLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { return; } - const int count = bottom[0]->count(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - caffe_copy(count, bottom_data, bottom_diff); - if (input_scale_ != Dtype(1)) { - caffe_gpu_scal(count, input_scale_, bottom_diff); - } - if (input_shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, input_shift_, bottom_diff); - } - caffe_gpu_powx(count, bottom_diff, Dtype(-1), bottom_diff); - if (backward_num_scale_ != Dtype(1)) { - caffe_gpu_scal(count, backward_num_scale_, bottom_diff); - } - caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff); -} - -INSTANTIATE_LAYER_GPU_FUNCS(LogLayer); - -} // namespace caffe diff --git a/src/caffe/layers/loss_layer.cpp b/src/caffe/layers/loss_layer.cpp index 3496a5c2..64abbaa0 100644 --- a/src/caffe/layers/loss_layer.cpp +++ b/src/caffe/layers/loss_layer.cpp @@ -11,8 +11,8 @@ namespace caffe { template -void LossLayer::LayerSetUp( - const vector*>& bottom, const vector*>& top) { 
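// The same restructuring repeats for every layer in this patch: the per-layer
// .cu file is deleted and its GPU path moves into the .cpp behind the CPU_ONLY
// guard, so the old "#ifdef CPU_ONLY / STUB_GPU / #endif" tail becomes
// "#ifndef CPU_ONLY / OpenCL Forward_gpu and Backward_gpu / #else / STUB_GPU /
// #endif". A schematic of the resulting .cpp layout, with ExampleLayer standing
// in for any of the layers touched here (shown as a comment, not compilable on
// its own):
//
//   #ifndef CPU_ONLY
//   // begin: code modified for OpenCL port
//   template <typename Dtype>
//   void ExampleLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
//                                         const vector<Blob<Dtype>*>& top) {
//     // OpenCL path: clEnqueue* calls or caffe_gpu_* wrappers
//   }
//   template <typename Dtype>
//   void ExampleLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
//       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
//     // OpenCL path
//   }
//   // end: code modified for OpenCL port
//   #else
//   STUB_GPU(ExampleLayer);  // CPU-only builds keep the NOT_IMPLEMENTED stubs
//   #endif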
+void LossLayer::LayerSetUp(const vector*>& bottom, + const vector*>& top) { // LossLayers have a non-zero (1) loss by default. if (this->layer_param_.loss_weight_size() == 0) { this->layer_param_.add_loss_weight(Dtype(1)); @@ -20,14 +20,14 @@ void LossLayer::LayerSetUp( } template -void LossLayer::Reshape( - const vector*>& bottom, const vector*>& top) { +void LossLayer::Reshape(const vector*>& bottom, + const vector*>& top) { CHECK_EQ(bottom[0]->num(), bottom[1]->num()) << "The data and label should have the same number."; vector loss_shape(0); // Loss layers output a scalar; 0 axes. top[0]->Reshape(loss_shape); } -INSTANTIATE_CLASS(LossLayer); +INSTANTIATE_CLASS (LossLayer); } // namespace caffe diff --git a/src/caffe/layers/lrn_layer.cpp b/src/caffe/layers/lrn_layer.cpp index 36c1ace4..0c91435b 100644 --- a/src/caffe/layers/lrn_layer.cpp +++ b/src/caffe/layers/lrn_layer.cpp @@ -3,20 +3,22 @@ #include "caffe/layer.hpp" #include "caffe/util/math_functions.hpp" #include "caffe/vision_layers.hpp" +#include "caffe/util/ocl_wrapper.hpp" +#include "caffe/util/math_functions.hpp" namespace caffe { template void LRNLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { size_ = this->layer_param_.lrn_param().local_size(); CHECK_EQ(size_ % 2, 1) << "LRN only supports odd values for local_size"; pre_pad_ = (size_ - 1) / 2; alpha_ = this->layer_param_.lrn_param().alpha(); beta_ = this->layer_param_.lrn_param().beta(); k_ = this->layer_param_.lrn_param().k(); - if (this->layer_param_.lrn_param().norm_region() == - LRNParameter_NormRegion_WITHIN_CHANNEL) { + if (this->layer_param_.lrn_param().norm_region() + == LRNParameter_NormRegion_WITHIN_CHANNEL) { // Set up split_layer_ to use inputs in the numerator and denominator. split_top_vec_.clear(); split_top_vec_.push_back(&product_input_); @@ -68,7 +70,7 @@ void LRNLayer::LayerSetUp(const vector*>& bottom, template void LRNLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " << "corresponding to (num, channels, height, width)"; num_ = bottom[0]->num(); @@ -115,46 +117,46 @@ void LRNLayer::CrossChannelForward_cpu( for (int i = 0; i < scale_.count(); ++i) { scale_data[i] = k_; } - Blob padded_square(1, channels_ + size_ - 1, height_, width_); + Blob < Dtype > padded_square(1, channels_ + size_ - 1, height_, width_); Dtype* padded_square_data = padded_square.mutable_cpu_data(); caffe_set(padded_square.count(), Dtype(0), padded_square_data); Dtype alpha_over_size = alpha_ / size_; // go through the images for (int n = 0; n < num_; ++n) { // compute the padded square - caffe_sqr(channels_ * height_ * width_, - bottom_data + bottom[0]->offset(n), + caffe_sqr(channels_ * height_ * width_, bottom_data + bottom[0]->offset(n), padded_square_data + padded_square.offset(0, pre_pad_)); // Create the first channel scale for (int c = 0; c < size_; ++c) { - caffe_axpy(height_ * width_, alpha_over_size, - padded_square_data + padded_square.offset(0, c), - scale_data + scale_.offset(n, 0)); + caffe_axpy < Dtype + > (height_ * width_, alpha_over_size, padded_square_data + + padded_square.offset(0, c), scale_data + scale_.offset(n, 0)); } for (int c = 1; c < channels_; ++c) { // copy previous scale - caffe_copy(height_ * width_, - scale_data + scale_.offset(n, c - 1), - scale_data + scale_.offset(n, c)); + caffe_copy < Dtype + > (height_ * width_, scale_data + scale_.offset(n, c - 1), scale_data + + scale_.offset(n, c)); // add head - 
caffe_axpy(height_ * width_, alpha_over_size, - padded_square_data + padded_square.offset(0, c + size_ - 1), - scale_data + scale_.offset(n, c)); + caffe_axpy < Dtype + > (height_ * width_, alpha_over_size, padded_square_data + + padded_square.offset(0, c + size_ - 1), scale_data + + scale_.offset(n, c)); // subtract tail - caffe_axpy(height_ * width_, -alpha_over_size, - padded_square_data + padded_square.offset(0, c - 1), - scale_data + scale_.offset(n, c)); + caffe_axpy < Dtype + > (height_ * width_, -alpha_over_size, padded_square_data + + padded_square.offset(0, c - 1), scale_data + scale_.offset(n, c)); } } // In the end, compute output - caffe_powx(scale_.count(), scale_data, -beta_, top_data); - caffe_mul(scale_.count(), top_data, bottom_data, top_data); + caffe_powx < Dtype > (scale_.count(), scale_data, -beta_, top_data); + caffe_mul < Dtype > (scale_.count(), top_data, bottom_data, top_data); } template -void LRNLayer::WithinChannelForward( - const vector*>& bottom, const vector*>& top) { +void LRNLayer::WithinChannelForward(const vector*>& bottom, + const vector*>& top) { split_layer_->Forward(bottom, split_top_vec_); square_layer_->Forward(square_bottom_vec_, square_top_vec_); pool_layer_->Forward(square_top_vec_, pool_top_vec_); @@ -178,16 +180,15 @@ void LRNLayer::Backward_cpu(const vector*>& top, } template -void LRNLayer::CrossChannelBackward_cpu( - const vector*>& top, const vector& propagate_down, - const vector*>& bottom) { +void LRNLayer::CrossChannelBackward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { const Dtype* top_diff = top[0]->cpu_diff(); const Dtype* top_data = top[0]->cpu_data(); const Dtype* bottom_data = bottom[0]->cpu_data(); const Dtype* scale_data = scale_.cpu_data(); Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - Blob padded_ratio(1, channels_ + size_ - 1, height_, width_); - Blob accum_ratio(1, 1, height_, width_); + Blob < Dtype > padded_ratio(1, channels_ + size_ - 1, height_, width_); + Blob < Dtype > accum_ratio(1, 1, height_, width_); Dtype* padded_ratio_data = padded_ratio.mutable_cpu_data(); Dtype* accum_ratio_data = accum_ratio.mutable_cpu_data(); // We hack a little bit by using the diff() to store an additional result @@ -195,65 +196,129 @@ void LRNLayer::CrossChannelBackward_cpu( caffe_set(padded_ratio.count(), Dtype(0), padded_ratio_data); Dtype cache_ratio_value = 2. 
* alpha_ * beta_ / size_; - caffe_powx(scale_.count(), scale_data, -beta_, bottom_diff); - caffe_mul(scale_.count(), top_diff, bottom_diff, bottom_diff); + caffe_powx < Dtype > (scale_.count(), scale_data, -beta_, bottom_diff); + caffe_mul < Dtype > (scale_.count(), top_diff, bottom_diff, bottom_diff); // go through individual data int inverse_pre_pad = size_ - (size_ + 1) / 2; for (int n = 0; n < num_; ++n) { int block_offset = scale_.offset(n); // first, compute diff_i * y_i / s_i - caffe_mul(channels_ * height_ * width_, - top_diff + block_offset, top_data + block_offset, - padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad)); - caffe_div(channels_ * height_ * width_, - padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad), - scale_data + block_offset, - padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad)); + caffe_mul < Dtype + > (channels_ * height_ * width_, top_diff + block_offset, top_data + + block_offset, padded_ratio_data + + padded_ratio.offset(0, inverse_pre_pad)); + caffe_div < Dtype + > (channels_ * height_ * width_, padded_ratio_data + + padded_ratio.offset(0, inverse_pre_pad), scale_data + + block_offset, padded_ratio_data + + padded_ratio.offset(0, inverse_pre_pad)); // Now, compute the accumulated ratios and the bottom diff caffe_set(accum_ratio.count(), Dtype(0), accum_ratio_data); for (int c = 0; c < size_ - 1; ++c) { - caffe_axpy(height_ * width_, 1., - padded_ratio_data + padded_ratio.offset(0, c), accum_ratio_data); + caffe_axpy < Dtype + > (height_ * width_, 1., padded_ratio_data + + padded_ratio.offset(0, c), accum_ratio_data); } for (int c = 0; c < channels_; ++c) { - caffe_axpy(height_ * width_, 1., - padded_ratio_data + padded_ratio.offset(0, c + size_ - 1), - accum_ratio_data); + caffe_axpy < Dtype + > (height_ * width_, 1., padded_ratio_data + + padded_ratio.offset(0, c + size_ - 1), accum_ratio_data); // compute bottom diff - caffe_mul(height_ * width_, - bottom_data + top[0]->offset(n, c), - accum_ratio_data, accum_ratio_times_bottom); - caffe_axpy(height_ * width_, -cache_ratio_value, - accum_ratio_times_bottom, bottom_diff + top[0]->offset(n, c)); - caffe_axpy(height_ * width_, -1., - padded_ratio_data + padded_ratio.offset(0, c), accum_ratio_data); + caffe_mul < Dtype + > (height_ * width_, bottom_data + top[0]->offset(n, c), accum_ratio_data, accum_ratio_times_bottom); + caffe_axpy < Dtype + > (height_ * width_, -cache_ratio_value, accum_ratio_times_bottom, bottom_diff + + top[0]->offset(n, c)); + caffe_axpy < Dtype + > (height_ * width_, -1., padded_ratio_data + + padded_ratio.offset(0, c), accum_ratio_data); } } } template -void LRNLayer::WithinChannelBackward( - const vector*>& top, const vector& propagate_down, - const vector*>& bottom) { +void LRNLayer::WithinChannelBackward(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { if (propagate_down[0]) { vector product_propagate_down(2, true); product_layer_->Backward(top, product_propagate_down, product_bottom_vec_); power_layer_->Backward(power_top_vec_, propagate_down, pool_top_vec_); pool_layer_->Backward(pool_top_vec_, propagate_down, square_top_vec_); square_layer_->Backward(square_top_vec_, propagate_down, - square_bottom_vec_); + square_bottom_vec_); split_layer_->Backward(split_top_vec_, propagate_down, bottom); } } -#ifdef CPU_ONLY +#ifndef CPU_ONLY +// begin: code modified for OpenCL port +template +void LRNLayer::CrossChannelForward_gpu( + const vector*>& bottom, const vector*>& top) { + // First, compute scale + const Dtype* bottom_data = 
bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + Dtype* scale_data = scale_.mutable_gpu_data(); + // We will launch one kernel for each pixel location, and have the kernel + // go through all the channels. + int n_threads = num_ * height_ * width_; + // NOLINT_NEXT_LINE(whitespace/operators) + LRNFillScale(n_threads, bottom_data, num_, channels_, height_, width_, size_, + alpha_ / size_, k_, scale_data); + n_threads = bottom[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + LRNComputeOutput(n_threads, bottom_data, scale_data, -beta_, top_data); +} + +template +void LRNLayer::CrossChannelBackward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + int n_threads = num_ * height_ * width_; + // NOLINT_NEXT_LINE(whitespace/operators) + LRNComputeDiff(n_threads, bottom[0]->gpu_data(), top[0]->gpu_data(), + scale_.gpu_data(), top[0]->gpu_diff(), num_, channels_, height_, width_, + size_, -beta_, Dtype(2. * alpha_ * beta_ / size_), + bottom[0]->mutable_gpu_diff()); +} + +template +void LRNLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + switch (this->layer_param_.lrn_param().norm_region()) { + case LRNParameter_NormRegion_ACROSS_CHANNELS: + CrossChannelForward_gpu(bottom, top); + break; + case LRNParameter_NormRegion_WITHIN_CHANNEL: + WithinChannelForward(bottom, top); + break; + default: + LOG(FATAL) << "Unknown normalization region."; + } +} + +template +void LRNLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + switch (this->layer_param_.lrn_param().norm_region()) { + case LRNParameter_NormRegion_ACROSS_CHANNELS: + CrossChannelBackward_gpu(top, propagate_down, bottom); + break; + case LRNParameter_NormRegion_WITHIN_CHANNEL: + WithinChannelBackward(top, propagate_down, bottom); + break; + default: + LOG(FATAL) << "Unknown normalization region."; + } +} +// end: code modified for OpenCL port +#else STUB_GPU(LRNLayer); STUB_GPU_FORWARD(LRNLayer, CrossChannelForward); STUB_GPU_BACKWARD(LRNLayer, CrossChannelBackward); #endif -INSTANTIATE_CLASS(LRNLayer); -REGISTER_LAYER_CLASS(LRN); +INSTANTIATE_CLASS (LRNLayer); +REGISTER_LAYER_CLASS (LRN); } // namespace caffe diff --git a/src/caffe/layers/lrn_layer.cu b/src/caffe/layers/lrn_layer.cu deleted file mode 100644 index 001b3c34..00000000 --- a/src/caffe/layers/lrn_layer.cu +++ /dev/null @@ -1,203 +0,0 @@ -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -__global__ void LRNFillScale(const int nthreads, const Dtype* const in, - const int num, const int channels, const int height, - const int width, const int size, const Dtype alpha_over_size, - const Dtype k, Dtype* const scale) { - CUDA_KERNEL_LOOP(index, nthreads) { - // find out the local offset - const int w = index % width; - const int h = (index / width) % height; - const int n = index / width / height; - const int offset = (n * channels * height + h) * width + w; - const int step = height * width; - const Dtype* const in_off = in + offset; - Dtype* const scale_off = scale + offset; - int head = 0; - const int pre_pad = (size - 1) / 2; - const int post_pad = size - pre_pad - 1; - Dtype accum_scale = 0; - // fill the scale at [n, :, h, w] - // accumulate values - while (head < post_pad && head < channels) { - accum_scale += in_off[head * step] * in_off[head * step]; - ++head; - } - // both add and subtract - while (head < channels) { - accum_scale += in_off[head * 
step] * in_off[head * step]; - if (head - size >= 0) { - accum_scale -= in_off[(head - size) * step] - * in_off[(head - size) * step]; - } - scale_off[(head - post_pad) * step] = k + accum_scale * alpha_over_size; - ++head; - } - // subtract only - while (head < channels + post_pad) { - if (head - size >= 0) { - accum_scale -= in_off[(head - size) * step] - * in_off[(head - size) * step]; - } - scale_off[(head - post_pad) * step] = k + accum_scale * alpha_over_size; - ++head; - } - } -} - - -template -void LRNLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - switch (this->layer_param_.lrn_param().norm_region()) { - case LRNParameter_NormRegion_ACROSS_CHANNELS: - CrossChannelForward_gpu(bottom, top); - break; - case LRNParameter_NormRegion_WITHIN_CHANNEL: - WithinChannelForward(bottom, top); - break; - default: - LOG(FATAL) << "Unknown normalization region."; - } -} - -// TODO: check if it would be faster to just put it into the previous kernel. -template -__global__ void LRNComputeOutput(const int nthreads, const Dtype* const in, - const Dtype* const scale, const Dtype negative_beta, Dtype* const out) { - CUDA_KERNEL_LOOP(index, nthreads) { - out[index] = in[index] * pow(scale[index], negative_beta); - } -} - -template -void LRNLayer::CrossChannelForward_gpu( - const vector*>& bottom, const vector*>& top) { - // First, compute scale - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - Dtype* scale_data = scale_.mutable_gpu_data(); - // We will launch one kernel for each pixel location, and have the kernel - // go through all the channels. - int n_threads = num_ * height_ * width_; - // NOLINT_NEXT_LINE(whitespace/operators) - LRNFillScale<<>>( - n_threads, bottom_data, num_, channels_, height_, width_, size_, - alpha_ / size_, k_, scale_data); - CUDA_POST_KERNEL_CHECK; - n_threads = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - LRNComputeOutput<<>>( - n_threads, bottom_data, scale_data, -beta_, top_data); - CUDA_POST_KERNEL_CHECK; -} -template void LRNLayer::CrossChannelForward_gpu( - const vector*>& bottom, const vector*>& top); -template void LRNLayer::CrossChannelForward_gpu( - const vector*>& bottom, const vector*>& top); - - -template -void LRNLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - switch (this->layer_param_.lrn_param().norm_region()) { - case LRNParameter_NormRegion_ACROSS_CHANNELS: - CrossChannelBackward_gpu(top, propagate_down, bottom); - break; - case LRNParameter_NormRegion_WITHIN_CHANNEL: - WithinChannelBackward(top, propagate_down, bottom); - break; - default: - LOG(FATAL) << "Unknown normalization region."; - } -} - -template -__global__ void LRNComputeDiff(const int nthreads, - const Dtype* const bottom_data, const Dtype* const top_data, - const Dtype* const scale, const Dtype* const top_diff, - const int num, const int channels, const int height, - const int width, const int size, const Dtype negative_beta, - const Dtype cache_ratio, Dtype* const bottom_diff) { - CUDA_KERNEL_LOOP(index, nthreads) { - // find out the local offset - const int w = index % width; - const int h = (index / width) % height; - const int n = index / width / height; - const int offset = (n * channels * height + h) * width + w; - const int step = height * width; - const Dtype* const bottom_off = bottom_data + offset; - const Dtype* const top_off = top_data + offset; - const Dtype* const scale_off = scale + offset; - const Dtype* const top_diff_off = 
top_diff + offset; - Dtype* const bottom_diff_off = bottom_diff + offset; - int head = 0; - const int pre_pad = size - (size + 1) / 2; - const int post_pad = size - pre_pad - 1; - Dtype accum_ratio = 0; - // accumulate values - while (head < post_pad && head < channels) { - accum_ratio += top_diff_off[head * step] * top_off[head * step] / - scale_off[head * step]; - ++head; - } - // both add and subtract - while (head < channels) { - accum_ratio += top_diff_off[head * step] * top_off[head * step] / - scale_off[head * step]; - if (head - size >= 0) { - accum_ratio -= top_diff_off[(head - size) * step] * - top_off[(head - size) * step] / scale_off[(head - size) * step]; - } - bottom_diff_off[(head - post_pad) * step] = - top_diff_off[(head - post_pad) * step] - * pow(scale_off[(head - post_pad) * step], negative_beta) - - cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio; - ++head; - } - // subtract only - while (head < channels + post_pad) { - if (head - size >= 0) { - accum_ratio -= top_diff_off[(head - size) * step] * - top_off[(head - size) * step] / scale_off[(head - size) * step]; - } - bottom_diff_off[(head - post_pad) * step] = - top_diff_off[(head - post_pad) * step] - * pow(scale_off[(head - post_pad) * step], negative_beta) - - cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio; - ++head; - } - } -} - -template -void LRNLayer::CrossChannelBackward_gpu( - const vector*>& top, const vector& propagate_down, - const vector*>& bottom) { - int n_threads = num_ * height_ * width_; - // NOLINT_NEXT_LINE(whitespace/operators) - LRNComputeDiff<<>>( - n_threads, bottom[0]->gpu_data(), top[0]->gpu_data(), - scale_.gpu_data(), top[0]->gpu_diff(), num_, channels_, height_, width_, - size_, -beta_, Dtype(2. * alpha_ * beta_ / size_), - bottom[0]->mutable_gpu_diff()); -} -template void LRNLayer::CrossChannelBackward_gpu( - const vector*>& top, const vector& propagate_down, - const vector*>& bottom); -template void LRNLayer::CrossChannelBackward_gpu( - const vector*>& top, const vector& propagate_down, - const vector*>& bottom); - - - -INSTANTIATE_LAYER_GPU_FUNCS(LRNLayer); - -} // namespace caffe diff --git a/src/caffe/layers/memory_data_layer.cpp b/src/caffe/layers/memory_data_layer.cpp index 42de4198..eff0129c 100644 --- a/src/caffe/layers/memory_data_layer.cpp +++ b/src/caffe/layers/memory_data_layer.cpp @@ -10,15 +10,15 @@ namespace caffe { template void MemoryDataLayer::DataLayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { batch_size_ = this->layer_param_.memory_data_param().batch_size(); channels_ = this->layer_param_.memory_data_param().channels(); height_ = this->layer_param_.memory_data_param().height(); width_ = this->layer_param_.memory_data_param().width(); size_ = channels_ * height_ * width_; - CHECK_GT(batch_size_ * size_, 0) << - "batch_size, channels, height, and width must be specified and" - " positive in memory_data_param"; + CHECK_GT(batch_size_ * size_, 0) + << "batch_size, channels, height, and width must be specified and" + " positive in memory_data_param"; vector label_shape(1, batch_size_); top[0]->Reshape(batch_size_, channels_, height_, width_); top[1]->Reshape(label_shape); @@ -32,12 +32,12 @@ void MemoryDataLayer::DataLayerSetUp(const vector*>& bottom, template void MemoryDataLayer::AddDatumVector(const vector& datum_vector) { - CHECK(!has_new_data_) << - "Can't add data until current data has been consumed."; + CHECK(!has_new_data_) + << "Can't add data until current data has been consumed."; size_t num 
= datum_vector.size(); CHECK_GT(num, 0) << "There is no datum to add."; - CHECK_EQ(num % batch_size_, 0) << - "The added data must be a multiple of the batch size."; + CHECK_EQ(num % batch_size_, 0) + << "The added data must be a multiple of the batch size."; added_data_.Reshape(num, channels_, height_, width_); added_label_.Reshape(num, 1, 1, 1); // Apply data transformations (mirror, scale, crop...) @@ -57,11 +57,11 @@ template void MemoryDataLayer::AddMatVector(const vector& mat_vector, const vector& labels) { size_t num = mat_vector.size(); - CHECK(!has_new_data_) << - "Can't add mat until current data has been consumed."; + CHECK(!has_new_data_) + << "Can't add mat until current data has been consumed."; CHECK_GT(num, 0) << "There is no mat to add"; - CHECK_EQ(num % batch_size_, 0) << - "The added data must be a multiple of the batch size."; + CHECK_EQ(num % batch_size_, 0) + << "The added data must be a multiple of the batch size."; added_data_.Reshape(num, channels_, height_, width_); added_label_.Reshape(num, 1, 1, 1); // Apply data transformations (mirror, scale, crop...) @@ -95,8 +95,8 @@ void MemoryDataLayer::Reset(Dtype* data, Dtype* labels, int n) { template void MemoryDataLayer::set_batch_size(int new_size) { - CHECK(!has_new_data_) << - "Can't change batch_size until current data has been consumed."; + CHECK(!has_new_data_) + << "Can't change batch_size until current data has been consumed."; batch_size_ = new_size; added_data_.Reshape(batch_size_, channels_, height_, width_); added_label_.Reshape(batch_size_, 1, 1, 1); @@ -104,7 +104,7 @@ void MemoryDataLayer::set_batch_size(int new_size) { template void MemoryDataLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { CHECK(data_) << "MemoryDataLayer needs to be initalized by calling Reset"; top[0]->Reshape(batch_size_, channels_, height_, width_); top[1]->Reshape(batch_size_, 1, 1, 1); @@ -115,7 +115,7 @@ void MemoryDataLayer::Forward_cpu(const vector*>& bottom, has_new_data_ = false; } -INSTANTIATE_CLASS(MemoryDataLayer); -REGISTER_LAYER_CLASS(MemoryData); +INSTANTIATE_CLASS (MemoryDataLayer); +REGISTER_LAYER_CLASS (MemoryData); } // namespace caffe diff --git a/src/caffe/layers/multinomial_logistic_loss_layer.cpp b/src/caffe/layers/multinomial_logistic_loss_layer.cpp index 4267a594..4d8b69bc 100644 --- a/src/caffe/layers/multinomial_logistic_loss_layer.cpp +++ b/src/caffe/layers/multinomial_logistic_loss_layer.cpp @@ -13,7 +13,7 @@ namespace caffe { template void MultinomialLogisticLossLayer::Reshape( const vector*>& bottom, const vector*>& top) { - LossLayer::Reshape(bottom, top); + LossLayer < Dtype > ::Reshape(bottom, top); CHECK_EQ(bottom[1]->channels(), 1); CHECK_EQ(bottom[1]->height(), 1); CHECK_EQ(bottom[1]->width(), 1); @@ -29,8 +29,7 @@ void MultinomialLogisticLossLayer::Forward_cpu( Dtype loss = 0; for (int i = 0; i < num; ++i) { int label = static_cast(bottom_label[i]); - Dtype prob = std::max( - bottom_data[i * dim + label], Dtype(kLOG_THRESHOLD)); + Dtype prob = std::max(bottom_data[i * dim + label], Dtype(kLOG_THRESHOLD)); loss -= log(prob); } top[0]->mutable_cpu_data()[0] = loss / num; @@ -42,7 +41,7 @@ void MultinomialLogisticLossLayer::Backward_cpu( const vector*>& bottom) { if (propagate_down[1]) { LOG(FATAL) << this->type() - << " Layer cannot backpropagate to label inputs."; + << " Layer cannot backpropagate to label inputs."; } if (propagate_down[0]) { const Dtype* bottom_data = bottom[0]->cpu_data(); @@ -51,17 +50,17 @@ void 
MultinomialLogisticLossLayer::Backward_cpu( int num = bottom[0]->num(); int dim = bottom[0]->count() / bottom[0]->num(); caffe_set(bottom[0]->count(), Dtype(0), bottom_diff); - const Dtype scale = - top[0]->cpu_diff()[0] / num; + const Dtype scale = -top[0]->cpu_diff()[0] / num; for (int i = 0; i < num; ++i) { int label = static_cast(bottom_label[i]); - Dtype prob = std::max( - bottom_data[i * dim + label], Dtype(kLOG_THRESHOLD)); + Dtype prob = std::max(bottom_data[i * dim + label], + Dtype(kLOG_THRESHOLD)); bottom_diff[i * dim + label] = scale / prob; } } } -INSTANTIATE_CLASS(MultinomialLogisticLossLayer); -REGISTER_LAYER_CLASS(MultinomialLogisticLoss); +INSTANTIATE_CLASS (MultinomialLogisticLossLayer); +REGISTER_LAYER_CLASS (MultinomialLogisticLoss); } // namespace caffe diff --git a/src/caffe/layers/mvn_layer.cpp b/src/caffe/layers/mvn_layer.cpp index 3e79bddc..d64f5670 100644 --- a/src/caffe/layers/mvn_layer.cpp +++ b/src/caffe/layers/mvn_layer.cpp @@ -9,17 +9,14 @@ namespace caffe { template void MVNLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - top[0]->Reshape(bottom[0]->num(), bottom[0]->channels(), - bottom[0]->height(), bottom[0]->width()); - mean_.Reshape(bottom[0]->num(), bottom[0]->channels(), - 1, 1); - variance_.Reshape(bottom[0]->num(), bottom[0]->channels(), - 1, 1); - temp_.Reshape(bottom[0]->num(), bottom[0]->channels(), - bottom[0]->height(), bottom[0]->width()); - sum_multiplier_.Reshape(1, 1, - bottom[0]->height(), bottom[0]->width()); + const vector*>& top) { + top[0]->Reshape(bottom[0]->num(), bottom[0]->channels(), bottom[0]->height(), + bottom[0]->width()); + mean_.Reshape(bottom[0]->num(), bottom[0]->channels(), 1, 1); + variance_.Reshape(bottom[0]->num(), bottom[0]->channels(), 1, 1); + temp_.Reshape(bottom[0]->num(), bottom[0]->channels(), bottom[0]->height(), + bottom[0]->width()); + sum_multiplier_.Reshape(1, 1, bottom[0]->height(), bottom[0]->width()); Dtype* multiplier_data = sum_multiplier_.mutable_cpu_data(); caffe_set(sum_multiplier_.count(), Dtype(1), multiplier_data); eps_ = this->layer_param_.mvn_param().eps(); @@ -44,11 +41,10 @@ void MVNLayer::Forward_cpu(const vector*>& bottom, temp_.mutable_cpu_data()); // computes variance using var(X) = E(X^2) - (EX)^2 - caffe_cpu_gemv(CblasNoTrans, num, dim, 1. / dim, bottom_data, - sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data()); // EX - caffe_cpu_gemv(CblasNoTrans, num, dim, 1. / dim, temp_.cpu_data(), - sum_multiplier_.cpu_data(), 0., - variance_.mutable_cpu_data()); // E(X^2) + caffe_cpu_gemv < Dtype + > (CblasNoTrans, num, dim, 1. / dim, bottom_data, sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data()); // EX + caffe_cpu_gemv < Dtype + > (CblasNoTrans, num, dim, 1. 
/ dim, temp_.cpu_data(), sum_multiplier_.cpu_data(), 0., variance_.mutable_cpu_data()); // E(X^2) caffe_powx(mean_.count(), mean_.cpu_data(), Dtype(2), temp_.mutable_cpu_data()); // (EX)^2 caffe_sub(mean_.count(), variance_.cpu_data(), temp_.cpu_data(), @@ -56,31 +52,28 @@ void MVNLayer::Forward_cpu(const vector*>& bottom, // do mean and variance normalization // subtract mean - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, -1., - mean_.cpu_data(), sum_multiplier_.cpu_data(), 0., - temp_.mutable_cpu_data()); + caffe_cpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1., mean_.cpu_data(), sum_multiplier_.cpu_data(), 0., temp_.mutable_cpu_data()); caffe_add(temp_.count(), bottom_data, temp_.cpu_data(), top_data); // normalize variance caffe_powx(variance_.count(), variance_.cpu_data(), Dtype(0.5), - variance_.mutable_cpu_data()); + variance_.mutable_cpu_data()); caffe_add_scalar(variance_.count(), eps_, variance_.mutable_cpu_data()); - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - variance_.cpu_data(), sum_multiplier_.cpu_data(), 0., - temp_.mutable_cpu_data()); + caffe_cpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., variance_.cpu_data(), sum_multiplier_.cpu_data(), 0., temp_.mutable_cpu_data()); caffe_div(temp_.count(), top_data, temp_.cpu_data(), top_data); } else { - caffe_cpu_gemv(CblasNoTrans, num, dim, 1. / dim, bottom_data, - sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data()); // EX + caffe_cpu_gemv < Dtype + > (CblasNoTrans, num, dim, 1. / dim, bottom_data, sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data()); // EX // subtract mean - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, -1., - mean_.cpu_data(), sum_multiplier_.cpu_data(), 0., - temp_.mutable_cpu_data()); + caffe_cpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1., mean_.cpu_data(), sum_multiplier_.cpu_data(), 0., temp_.mutable_cpu_data()); caffe_add(temp_.count(), bottom_data, temp_.cpu_data(), top_data); } @@ -88,8 +81,7 @@ void MVNLayer::Forward_cpu(const vector*>& bottom, template void MVNLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { const Dtype* top_diff = top[0]->cpu_diff(); const Dtype* top_data = top[0]->cpu_data(); const Dtype* bottom_data = bottom[0]->cpu_data(); @@ -105,28 +97,24 @@ void MVNLayer::Backward_cpu(const vector*>& top, if (this->layer_param_.mvn_param().normalize_variance()) { caffe_mul(temp_.count(), top_data, top_diff, bottom_diff); - caffe_cpu_gemv(CblasNoTrans, num, dim, 1., bottom_diff, - sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data()); - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - mean_.cpu_data(), sum_multiplier_.cpu_data(), 0., - bottom_diff); + caffe_cpu_gemv < Dtype + > (CblasNoTrans, num, dim, 1., bottom_diff, sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data()); + caffe_cpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., mean_.cpu_data(), sum_multiplier_.cpu_data(), 0., bottom_diff); caffe_mul(temp_.count(), top_data, bottom_diff, bottom_diff); - caffe_cpu_gemv(CblasNoTrans, num, dim, 1., top_diff, - sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data()); - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - mean_.cpu_data(), sum_multiplier_.cpu_data(), 1., - bottom_diff); + caffe_cpu_gemv < Dtype + > (CblasNoTrans, num, dim, 1., top_diff, sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data()); + caffe_cpu_gemm < Dtype + > 
(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., mean_.cpu_data(), sum_multiplier_.cpu_data(), 1., bottom_diff); caffe_cpu_axpby(temp_.count(), Dtype(1), top_diff, Dtype(-1. / dim), bottom_diff); // put the squares of bottom into temp_ - caffe_powx(temp_.count(), bottom_data, Dtype(2), - temp_.mutable_cpu_data()); - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - variance_.cpu_data(), sum_multiplier_.cpu_data(), 0., - temp_.mutable_cpu_data()); + caffe_powx(temp_.count(), bottom_data, Dtype(2), temp_.mutable_cpu_data()); + caffe_cpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., variance_.cpu_data(), sum_multiplier_.cpu_data(), 0., temp_.mutable_cpu_data()); caffe_div(temp_.count(), bottom_diff, temp_.cpu_data(), bottom_diff); } else { @@ -134,12 +122,120 @@ void MVNLayer::Backward_cpu(const vector*>& top, } } +#ifndef CPU_ONLY +// begin: code modified for OpenCL port +template +void MVNLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + int num; + if (this->layer_param_.mvn_param().across_channels()) + num = bottom[0]->num(); + else + num = bottom[0]->num() * bottom[0]->channels(); + + int dim = bottom[0]->count() / num; + + if (this->layer_param_.mvn_param().normalize_variance()) { + // put the squares of bottom into temp_ + caffe_gpu_powx(bottom[0]->count(), bottom_data, Dtype(2), + temp_.mutable_gpu_data()); + + // computes variance using var(X) = E(X^2) - (EX)^2 + caffe_gpu_gemv < Dtype + > (CblasNoTrans, num, dim, 1. / dim, bottom_data, sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); // EX + caffe_gpu_gemv < Dtype + > (CblasNoTrans, num, dim, 1. / dim, temp_.gpu_data(), sum_multiplier_.gpu_data(), 0., variance_.mutable_gpu_data()); // E(X^2) + caffe_gpu_powx(mean_.count(), mean_.gpu_data(), Dtype(2), + temp_.mutable_gpu_data()); // (EX)^2 + caffe_gpu_sub(mean_.count(), variance_.gpu_data(), temp_.gpu_data(), + variance_.mutable_gpu_data()); // variance + + // do mean and variance normalization + // subtract mean + caffe_gpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1., mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., temp_.mutable_gpu_data()); + + caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(), top_data); + + // normalize variance + caffe_gpu_powx(variance_.count(), variance_.gpu_data(), Dtype(0.5), + variance_.mutable_gpu_data()); + + caffe_gpu_add_scalar(variance_.count(), eps_, variance_.mutable_gpu_data()); + + caffe_gpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., variance_.gpu_data(), sum_multiplier_.gpu_data(), 0., temp_.mutable_gpu_data()); + + caffe_gpu_div(temp_.count(), top_data, temp_.gpu_data(), top_data); + } else { + caffe_gpu_gemv < Dtype + > (CblasNoTrans, num, dim, 1. 
/ dim, bottom_data, sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); // EX + + // subtract mean + caffe_gpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1., mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., temp_.mutable_gpu_data()); + + caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(), top_data); + } +} + +template +void MVNLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + const Dtype* top_diff = top[0]->gpu_diff(); + const Dtype* top_data = top[0]->gpu_data(); + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + + int num; + if (this->layer_param_.mvn_param().across_channels()) + num = bottom[0]->num(); + else + num = bottom[0]->num() * bottom[0]->channels(); + + int dim = bottom[0]->count() / num; + + if (this->layer_param_.mvn_param().normalize_variance()) { + caffe_gpu_mul(temp_.count(), top_data, top_diff, bottom_diff); + caffe_gpu_gemv < Dtype + > (CblasNoTrans, num, dim, 1., bottom_diff, sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); + caffe_gpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., bottom_diff); + caffe_gpu_mul(temp_.count(), top_data, bottom_diff, bottom_diff); + + caffe_gpu_gemv < Dtype + > (CblasNoTrans, num, dim, 1., top_diff, sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); + caffe_gpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., mean_.gpu_data(), sum_multiplier_.gpu_data(), 1., bottom_diff); + + caffe_gpu_axpby(temp_.count(), Dtype(1), top_diff, Dtype(-1. / dim), + bottom_diff); + + // put the squares of bottom into temp_ + caffe_gpu_powx(temp_.count(), bottom_data, Dtype(2), + temp_.mutable_gpu_data()); + + caffe_gpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., variance_.gpu_data(), sum_multiplier_.gpu_data(), 0., temp_.mutable_gpu_data()); + + caffe_gpu_div(temp_.count(), bottom_diff, temp_.gpu_data(), bottom_diff); + } else { + caffe_gpu_gemv < Dtype + > (CblasNoTrans, num, dim, 1. 
/ dim, top_diff, sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); + caffe_gpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1., mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., temp_.mutable_gpu_data()); + caffe_gpu_add(temp_.count(), top_diff, temp_.gpu_data(), bottom_diff); + } +} +// end: code modified for OpenCL port -#ifdef CPU_ONLY +#else STUB_GPU(MVNLayer); #endif -INSTANTIATE_CLASS(MVNLayer); -REGISTER_LAYER_CLASS(MVN); +INSTANTIATE_CLASS (MVNLayer); +REGISTER_LAYER_CLASS (MVN); } // namespace caffe diff --git a/src/caffe/layers/mvn_layer.cu b/src/caffe/layers/mvn_layer.cu deleted file mode 100644 index 3888a0c7..00000000 --- a/src/caffe/layers/mvn_layer.cu +++ /dev/null @@ -1,124 +0,0 @@ -#include -#include - -#include "caffe/common_layers.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -void MVNLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - int num; - if (this->layer_param_.mvn_param().across_channels()) - num = bottom[0]->num(); - else - num = bottom[0]->num() * bottom[0]->channels(); - - int dim = bottom[0]->count() / num; - - if (this->layer_param_.mvn_param().normalize_variance()) { - // put the squares of bottom into temp_ - caffe_gpu_powx(bottom[0]->count(), bottom_data, Dtype(2), - temp_.mutable_gpu_data()); - - // computes variance using var(X) = E(X^2) - (EX)^2 - caffe_gpu_gemv(CblasNoTrans, num, dim, 1. / dim, bottom_data, - sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); // EX - caffe_gpu_gemv(CblasNoTrans, num, dim, 1. / dim, temp_.gpu_data(), - sum_multiplier_.gpu_data(), 0., - variance_.mutable_gpu_data()); // E(X^2) - caffe_gpu_powx(mean_.count(), mean_.gpu_data(), Dtype(2), - temp_.mutable_gpu_data()); // (EX)^2 - caffe_gpu_sub(mean_.count(), variance_.gpu_data(), temp_.gpu_data(), - variance_.mutable_gpu_data()); // variance - - // do mean and variance normalization - // subtract mean - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, -1., - mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., - temp_.mutable_gpu_data()); - - caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(), top_data); - - // normalize variance - caffe_gpu_powx(variance_.count(), variance_.gpu_data(), Dtype(0.5), - variance_.mutable_gpu_data()); - - caffe_gpu_add_scalar(variance_.count(), eps_, variance_.mutable_gpu_data()); - - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - variance_.gpu_data(), sum_multiplier_.gpu_data(), 0., - temp_.mutable_gpu_data()); - - caffe_gpu_div(temp_.count(), top_data, temp_.gpu_data(), top_data); - } else { - caffe_gpu_gemv(CblasNoTrans, num, dim, 1. 
/ dim, bottom_data, - sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); // EX - - // subtract mean - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, -1., - mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., - temp_.mutable_gpu_data()); - - caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(), top_data); - } -} - -template -void MVNLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - - int num; - if (this->layer_param_.mvn_param().across_channels()) - num = bottom[0]->num(); - else - num = bottom[0]->num() * bottom[0]->channels(); - - int dim = bottom[0]->count() / num; - - if (this->layer_param_.mvn_param().normalize_variance()) { - caffe_gpu_mul(temp_.count(), top_data, top_diff, bottom_diff); - caffe_gpu_gemv(CblasNoTrans, num, dim, 1., bottom_diff, - sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., - bottom_diff); - caffe_gpu_mul(temp_.count(), top_data, bottom_diff, bottom_diff); - - caffe_gpu_gemv(CblasNoTrans, num, dim, 1., top_diff, - sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - mean_.gpu_data(), sum_multiplier_.gpu_data(), 1., - bottom_diff); - - caffe_gpu_axpby(temp_.count(), Dtype(1), top_diff, Dtype(-1. / dim), - bottom_diff); - - // put the squares of bottom into temp_ - caffe_gpu_powx(temp_.count(), bottom_data, Dtype(2), - temp_.mutable_gpu_data()); - - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - variance_.gpu_data(), sum_multiplier_.gpu_data(), 0., - temp_.mutable_gpu_data()); - - caffe_gpu_div(temp_.count(), bottom_diff, temp_.gpu_data(), bottom_diff); - } else { - caffe_copy(temp_.count(), top_diff, bottom_diff); - } -} - - -INSTANTIATE_LAYER_GPU_FUNCS(MVNLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/neuron_layer.cpp b/src/caffe/layers/neuron_layer.cpp index ba67b438..4fa61aad 100644 --- a/src/caffe/layers/neuron_layer.cpp +++ b/src/caffe/layers/neuron_layer.cpp @@ -7,10 +7,10 @@ namespace caffe { template void NeuronLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { top[0]->ReshapeLike(*bottom[0]); } -INSTANTIATE_CLASS(NeuronLayer); +INSTANTIATE_CLASS (NeuronLayer); } // namespace caffe diff --git a/src/caffe/layers/pooling_layer.cpp b/src/caffe/layers/pooling_layer.cpp index c8d41499..812ffbb3 100644 --- a/src/caffe/layers/pooling_layer.cpp +++ b/src/caffe/layers/pooling_layer.cpp @@ -15,27 +15,31 @@ using std::max; template void PoolingLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { PoolingParameter pool_param = this->layer_param_.pooling_param(); if (pool_param.global_pooling()) { - CHECK(!(pool_param.has_kernel_size() || - pool_param.has_kernel_h() || pool_param.has_kernel_w())) - << "With Global_pooling: true Filter size cannot specified"; + CHECK( + !(pool_param.has_kernel_size() || pool_param.has_kernel_h() + || pool_param.has_kernel_w())) + << "With Global_pooling: true Filter size cannot specified"; } else { - CHECK(!pool_param.has_kernel_size() != - !(pool_param.has_kernel_h() && pool_param.has_kernel_w())) - << "Filter size is kernel_size OR kernel_h and kernel_w; not both"; - 
CHECK(pool_param.has_kernel_size() || - (pool_param.has_kernel_h() && pool_param.has_kernel_w())) - << "For non-square filters both kernel_h and kernel_w are required."; + CHECK( + !pool_param.has_kernel_size() + != !(pool_param.has_kernel_h() && pool_param.has_kernel_w())) + << "Filter size is kernel_size OR kernel_h and kernel_w; not both"; + CHECK( + pool_param.has_kernel_size() + || (pool_param.has_kernel_h() && pool_param.has_kernel_w())) + << "For non-square filters both kernel_h and kernel_w are required."; } - CHECK((!pool_param.has_pad() && pool_param.has_pad_h() - && pool_param.has_pad_w()) - || (!pool_param.has_pad_h() && !pool_param.has_pad_w())) + CHECK( + (!pool_param.has_pad() && pool_param.has_pad_h() && pool_param.has_pad_w()) + || (!pool_param.has_pad_h() && !pool_param.has_pad_w())) << "pad is pad OR pad_h and pad_w are required."; - CHECK((!pool_param.has_stride() && pool_param.has_stride_h() - && pool_param.has_stride_w()) - || (!pool_param.has_stride_h() && !pool_param.has_stride_w())) + CHECK( + (!pool_param.has_stride() && pool_param.has_stride_h() + && pool_param.has_stride_w()) + || (!pool_param.has_stride_h() && !pool_param.has_stride_w())) << "Stride is stride OR stride_h and stride_w are required."; global_pooling_ = pool_param.global_pooling(); if (global_pooling_) { @@ -65,13 +69,14 @@ void PoolingLayer::LayerSetUp(const vector*>& bottom, } if (global_pooling_) { CHECK(pad_h_ == 0 && pad_w_ == 0 && stride_h_ == 1 && stride_w_ == 1) - << "With Global_pooling: true; only pad = 0 and stride = 1"; + << "With Global_pooling: true; only pad = 0 and stride = 1"; } if (pad_h_ != 0 || pad_w_ != 0) { - CHECK(this->layer_param_.pooling_param().pool() - == PoolingParameter_PoolMethod_AVE - || this->layer_param_.pooling_param().pool() - == PoolingParameter_PoolMethod_MAX) + CHECK( + this->layer_param_.pooling_param().pool() + == PoolingParameter_PoolMethod_AVE + || this->layer_param_.pooling_param().pool() + == PoolingParameter_PoolMethod_MAX) << "Padding implemented only for average and max pooling."; CHECK_LT(pad_h_, kernel_h_); CHECK_LT(pad_w_, kernel_w_); @@ -80,7 +85,7 @@ void PoolingLayer::LayerSetUp(const vector*>& bottom, template void PoolingLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " << "corresponding to (num, channels, height, width)"; channels_ = bottom[0]->channels(); @@ -90,10 +95,10 @@ void PoolingLayer::Reshape(const vector*>& bottom, kernel_h_ = bottom[0]->height(); kernel_w_ = bottom[0]->width(); } - pooled_height_ = static_cast(ceil(static_cast( - height_ + 2 * pad_h_ - kernel_h_) / stride_h_)) + 1; - pooled_width_ = static_cast(ceil(static_cast( - width_ + 2 * pad_w_ - kernel_w_) / stride_w_)) + 1; + pooled_height_ = static_cast(ceil( + static_cast(height_ + 2 * pad_h_ - kernel_h_) / stride_h_)) + 1; + pooled_width_ = static_cast(ceil( + static_cast(width_ + 2 * pad_w_ - kernel_w_) / stride_w_)) + 1; if (pad_h_ || pad_w_) { // If we have padding, ensure that the last pooling starts strictly // inside the image (instead of at the padding); otherwise clip the last. 
@@ -106,22 +111,21 @@ void PoolingLayer::Reshape(const vector*>& bottom, CHECK_LT((pooled_height_ - 1) * stride_h_, height_ + pad_h_); CHECK_LT((pooled_width_ - 1) * stride_w_, width_ + pad_w_); } - top[0]->Reshape(bottom[0]->num(), channels_, pooled_height_, - pooled_width_); + top[0]->Reshape(bottom[0]->num(), channels_, pooled_height_, pooled_width_); if (top.size() > 1) { top[1]->ReshapeLike(*top[0]); } // If max pooling, we will initialize the vector index part. - if (this->layer_param_.pooling_param().pool() == - PoolingParameter_PoolMethod_MAX && top.size() == 1) { + if (this->layer_param_.pooling_param().pool() + == PoolingParameter_PoolMethod_MAX && top.size() == 1) { max_idx_.Reshape(bottom[0]->num(), channels_, pooled_height_, pooled_width_); } // If stochastic pooling, we will initialize the random index part. - if (this->layer_param_.pooling_param().pool() == - PoolingParameter_PoolMethod_STOCHASTIC) { + if (this->layer_param_.pooling_param().pool() + == PoolingParameter_PoolMethod_STOCHASTIC) { rand_idx_.Reshape(bottom[0]->num(), channels_, pooled_height_, - pooled_width_); + pooled_width_); } } @@ -129,7 +133,7 @@ void PoolingLayer::Reshape(const vector*>& bottom, // case? template void PoolingLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* top_data = top[0]->mutable_cpu_data(); const int top_count = top[0]->count(); @@ -231,7 +235,7 @@ void PoolingLayer::Forward_cpu(const vector*>& bottom, template void PoolingLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { if (!propagate_down[0]) { return; } @@ -289,8 +293,8 @@ void PoolingLayer::Backward_cpu(const vector*>& top, wend = min(wend, width_); for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - bottom_diff[h * width_ + w] += - top_diff[ph * pooled_width_ + pw] / pool_size; + bottom_diff[h * width_ + w] += top_diff[ph * pooled_width_ + pw] + / pool_size; } } } @@ -309,11 +313,106 @@ void PoolingLayer::Backward_cpu(const vector*>& top, } } +#ifndef CPU_ONLY +// begin: code modified for OpenCL port +template +void PoolingLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + //Forward_cpu(bottom, top); + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + int count = top[0]->count(); + // We'll output the mask to top[1] if it's of size >1. + const bool use_top_mask = top.size() > 1; + int* mask = NULL; + Dtype* top_mask = NULL; + switch (this->layer_param_.pooling_param().pool()) { + case PoolingParameter_PoolMethod_MAX: + if (use_top_mask) { + top_mask = top[1]->mutable_gpu_data(); + } else { + mask = max_idx_.mutable_gpu_data(); + } + // NOLINT_NEXT_LINE(whitespace/operators) + MaxPoolForward(count, bottom_data, bottom[0]->num(), channels_, height_, + width_, pooled_height_, pooled_width_, kernel_h_, kernel_w_, stride_h_, + stride_w_, pad_h_, pad_w_, top_data, mask, top_mask); + break; + case PoolingParameter_PoolMethod_AVE: + // NOLINT_NEXT_LINE(whitespace/operators) + AvePoolForward(count, bottom_data, bottom[0]->num(), channels_, height_, + width_, pooled_height_, pooled_width_, kernel_h_, kernel_w_, stride_h_, + stride_w_, pad_h_, pad_w_, top_data); + break; + case PoolingParameter_PoolMethod_STOCHASTIC: + if (this->phase_ == TRAIN) { + // We need to create the random index as well. 
+ caffe_gpu_rng_uniform(count, Dtype(0), Dtype(1), + rand_idx_.mutable_gpu_data()); + // NOLINT_NEXT_LINE(whitespace/operators) + StoPoolForwardTrain(count, bottom_data, bottom[0]->num(), channels_, + height_, width_, pooled_height_, pooled_width_, kernel_h_, kernel_w_, + stride_h_, stride_w_, rand_idx_.mutable_gpu_data(), top_data); + } else { + // NOLINT_NEXT_LINE(whitespace/operators) + StoPoolForwardTest(count, bottom_data, bottom[0]->num(), channels_, + height_, width_, pooled_height_, pooled_width_, kernel_h_, kernel_w_, + stride_h_, stride_w_, top_data); + } + break; + default: + LOG(FATAL) << "Unknown pooling method."; + } +} -#ifdef CPU_ONLY +template +void PoolingLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + //Backward_cpu(top, propagate_down, bottom); + if (!propagate_down[0]) { + return; + } + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const int count = bottom[0]->count(); + caffe_gpu_set(count, Dtype(0.), bottom_diff); + // We'll output the mask to top[1] if it's of size >1. + const bool use_top_mask = top.size() > 1; + const int* mask = NULL; + const Dtype* top_mask = NULL; + switch (this->layer_param_.pooling_param().pool()) { + case PoolingParameter_PoolMethod_MAX: + if (use_top_mask) { + top_mask = top[1]->gpu_data(); + } else { + mask = max_idx_.gpu_data(); + } + // NOLINT_NEXT_LINE(whitespace/operators) + MaxPoolBackward(count, top_diff, mask, top_mask, top[0]->num(), channels_, + height_, width_, pooled_height_, pooled_width_, kernel_h_, kernel_w_, + stride_h_, stride_w_, pad_h_, pad_w_, bottom_diff); + break; + case PoolingParameter_PoolMethod_AVE: + // NOLINT_NEXT_LINE(whitespace/operators) + AvePoolBackward(count, top_diff, top[0]->num(), channels_, height_, width_, + pooled_height_, pooled_width_, kernel_h_, kernel_w_, stride_h_, + stride_w_, pad_h_, pad_w_, bottom_diff); + break; + case PoolingParameter_PoolMethod_STOCHASTIC: + // NOLINT_NEXT_LINE(whitespace/operators) + StoPoolBackward(count, rand_idx_.gpu_data(), top_diff, top[0]->num(), + channels_, height_, width_, pooled_height_, pooled_width_, kernel_h_, + kernel_w_, stride_h_, stride_w_, bottom_diff); + break; + default: + LOG(FATAL) << "Unknown pooling method."; + } +} +// end: code modified for OpenCL port +#else STUB_GPU(PoolingLayer); #endif -INSTANTIATE_CLASS(PoolingLayer); +INSTANTIATE_CLASS (PoolingLayer); } // namespace caffe diff --git a/src/caffe/layers/pooling_layer.cu b/src/caffe/layers/pooling_layer.cu deleted file mode 100644 index ca4b13f7..00000000 --- a/src/caffe/layers/pooling_layer.cu +++ /dev/null @@ -1,387 +0,0 @@ -#include -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -__global__ void MaxPoolForward(const int nthreads, - const Dtype* const bottom_data, const int num, const int channels, - const int height, const int width, const int pooled_height, - const int pooled_width, const int kernel_h, const int kernel_w, - const int stride_h, const int stride_w, const int pad_h, const int pad_w, - Dtype* const top_data, int* mask, Dtype* top_mask) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int pw = index % pooled_width; - const int ph = (index / pooled_width) % pooled_height; - const int c = (index / pooled_width / pooled_height) % channels; - const int n = index / pooled_width / pooled_height / channels; - int hstart = ph * stride_h - pad_h; - int wstart = pw * 
stride_w - pad_w; - const int hend = min(hstart + kernel_h, height); - const int wend = min(wstart + kernel_w, width); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - Dtype maxval = -FLT_MAX; - int maxidx = -1; - const Dtype* const bottom_slice = - bottom_data + (n * channels + c) * height * width; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - if (bottom_slice[h * width + w] > maxval) { - maxidx = h * width + w; - maxval = bottom_slice[maxidx]; - } - } - } - top_data[index] = maxval; - if (mask) { - mask[index] = maxidx; - } else { - top_mask[index] = maxidx; - } - } -} - -template -__global__ void AvePoolForward(const int nthreads, - const Dtype* const bottom_data, const int num, const int channels, - const int height, const int width, const int pooled_height, - const int pooled_width, const int kernel_h, const int kernel_w, - const int stride_h, const int stride_w, const int pad_h, const int pad_w, - Dtype* const top_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int pw = index % pooled_width; - const int ph = (index / pooled_width) % pooled_height; - const int c = (index / pooled_width / pooled_height) % channels; - const int n = index / pooled_width / pooled_height / channels; - int hstart = ph * stride_h - pad_h; - int wstart = pw * stride_w - pad_w; - int hend = min(hstart + kernel_h, height + pad_h); - int wend = min(wstart + kernel_w, width + pad_w); - const int pool_size = (hend - hstart) * (wend - wstart); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - hend = min(hend, height); - wend = min(wend, width); - Dtype aveval = 0; - const Dtype* const bottom_slice = - bottom_data + (n * channels + c) * height * width; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - aveval += bottom_slice[h * width + w]; - } - } - top_data[index] = aveval / pool_size; - } -} - -template -__global__ void StoPoolForwardTrain(const int nthreads, - const Dtype* const bottom_data, - const int num, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, Dtype* const rand_idx, Dtype* const top_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int pw = index % pooled_width; - const int ph = (index / pooled_width) % pooled_height; - const int c = (index / pooled_width / pooled_height) % channels; - const int n = index / pooled_width / pooled_height / channels; - const int hstart = ph * stride_h; - const int hend = min(hstart + kernel_h, height); - const int wstart = pw * stride_w; - const int wend = min(wstart + kernel_w, width); - Dtype cumsum = 0.; - const Dtype* const bottom_slice = - bottom_data + (n * channels + c) * height * width; - // First pass: get sum - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - cumsum += bottom_slice[h * width + w]; - } - } - const float thres = rand_idx[index] * cumsum; - // Second pass: get value, and set index. 
- cumsum = 0; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - cumsum += bottom_slice[h * width + w]; - if (cumsum >= thres) { - rand_idx[index] = ((n * channels + c) * height + h) * width + w; - top_data[index] = bottom_slice[h * width + w]; - return; - } - } - } - } -} - - -template -__global__ void StoPoolForwardTest(const int nthreads, - const Dtype* const bottom_data, - const int num, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, Dtype* const top_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int pw = index % pooled_width; - const int ph = (index / pooled_width) % pooled_height; - const int c = (index / pooled_width / pooled_height) % channels; - const int n = index / pooled_width / pooled_height / channels; - const int hstart = ph * stride_h; - const int hend = min(hstart + kernel_h, height); - const int wstart = pw * stride_w; - const int wend = min(wstart + kernel_w, width); - // We set cumsum to be 0 to avoid divide-by-zero problems - Dtype cumsum = FLT_MIN; - Dtype cumvalues = 0.; - const Dtype* const bottom_slice = - bottom_data + (n * channels + c) * height * width; - // First pass: get sum - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - cumsum += bottom_slice[h * width + w]; - cumvalues += bottom_slice[h * width + w] * bottom_slice[h * width + w]; - } - } - top_data[index] = cumvalues / cumsum; - } -} - - -template -void PoolingLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - int count = top[0]->count(); - // We'll output the mask to top[1] if it's of size >1. - const bool use_top_mask = top.size() > 1; - int* mask = NULL; - Dtype* top_mask = NULL; - switch (this->layer_param_.pooling_param().pool()) { - case PoolingParameter_PoolMethod_MAX: - if (use_top_mask) { - top_mask = top[1]->mutable_gpu_data(); - } else { - mask = max_idx_.mutable_gpu_data(); - } - // NOLINT_NEXT_LINE(whitespace/operators) - MaxPoolForward<<>>( - count, bottom_data, bottom[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, kernel_h_, - kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data, - mask, top_mask); - break; - case PoolingParameter_PoolMethod_AVE: - // NOLINT_NEXT_LINE(whitespace/operators) - AvePoolForward<<>>( - count, bottom_data, bottom[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, kernel_h_, - kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data); - break; - case PoolingParameter_PoolMethod_STOCHASTIC: - if (this->phase_ == TRAIN) { - // We need to create the random index as well. 
- caffe_gpu_rng_uniform(count, Dtype(0), Dtype(1), - rand_idx_.mutable_gpu_data()); - // NOLINT_NEXT_LINE(whitespace/operators) - StoPoolForwardTrain<<>>( - count, bottom_data, bottom[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, kernel_h_, - kernel_w_, stride_h_, stride_w_, - rand_idx_.mutable_gpu_data(), top_data); - } else { - // NOLINT_NEXT_LINE(whitespace/operators) - StoPoolForwardTest<<>>( - count, bottom_data, bottom[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, kernel_h_, - kernel_w_, stride_h_, stride_w_, top_data); - } - break; - default: - LOG(FATAL) << "Unknown pooling method."; - } - CUDA_POST_KERNEL_CHECK; -} - - -template -__global__ void MaxPoolBackward(const int nthreads, const Dtype* const top_diff, - const int* const mask, const Dtype* const top_mask, const int num, - const int channels, const int height, const int width, - const int pooled_height, const int pooled_width, const int kernel_h, - const int kernel_w, const int stride_h, const int stride_w, const int pad_h, - const int pad_w, Dtype* const bottom_diff) { - CUDA_KERNEL_LOOP(index, nthreads) { - // find out the local index - // find out the local offset - const int w = index % width; - const int h = (index / width) % height; - const int c = (index / width / height) % channels; - const int n = index / width / height / channels; - const int phstart = - (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1; - const int phend = min((h + pad_h) / stride_h + 1, pooled_height); - const int pwstart = - (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1; - const int pwend = min((w + pad_w) / stride_w + 1, pooled_width); - Dtype gradient = 0; - const int offset = (n * channels + c) * pooled_height * pooled_width; - const Dtype* const top_diff_slice = top_diff + offset; - if (mask) { - const int* const mask_slice = mask + offset; - for (int ph = phstart; ph < phend; ++ph) { - for (int pw = pwstart; pw < pwend; ++pw) { - if (mask_slice[ph * pooled_width + pw] == h * width + w) { - gradient += top_diff_slice[ph * pooled_width + pw]; - } - } - } - } else { - const Dtype* const top_mask_slice = top_mask + offset; - for (int ph = phstart; ph < phend; ++ph) { - for (int pw = pwstart; pw < pwend; ++pw) { - if (top_mask_slice[ph * pooled_width + pw] == h * width + w) { - gradient += top_diff_slice[ph * pooled_width + pw]; - } - } - } - } - bottom_diff[index] = gradient; - } -} - -template -__global__ void AvePoolBackward(const int nthreads, const Dtype* const top_diff, - const int num, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, const int pad_h, const int pad_w, - Dtype* const bottom_diff) { - CUDA_KERNEL_LOOP(index, nthreads) { - // find out the local index - // find out the local offset - const int w = index % width + pad_w; - const int h = (index / width) % height + pad_h; - const int c = (index / width / height) % channels; - const int n = index / width / height / channels; - const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; - const int phend = min(h / stride_h + 1, pooled_height); - const int pwstart = (w < kernel_w) ? 
0 : (w - kernel_w) / stride_w + 1; - const int pwend = min(w / stride_w + 1, pooled_width); - Dtype gradient = 0; - const Dtype* const top_diff_slice = - top_diff + (n * channels + c) * pooled_height * pooled_width; - for (int ph = phstart; ph < phend; ++ph) { - for (int pw = pwstart; pw < pwend; ++pw) { - // figure out the pooling size - int hstart = ph * stride_h - pad_h; - int wstart = pw * stride_w - pad_w; - int hend = min(hstart + kernel_h, height + pad_h); - int wend = min(wstart + kernel_w, width + pad_w); - int pool_size = (hend - hstart) * (wend - wstart); - gradient += top_diff_slice[ph * pooled_width + pw] / pool_size; - } - } - bottom_diff[index] = gradient; - } -} - - -template -__global__ void StoPoolBackward(const int nthreads, - const Dtype* const rand_idx, const Dtype* const top_diff, - const int num, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, Dtype* const bottom_diff) { - CUDA_KERNEL_LOOP(index, nthreads) { - // find out the local index - // find out the local offset - const int w = index % width; - const int h = (index / width) % height; - const int c = (index / width / height) % channels; - const int n = index / width / height / channels; - const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; - const int phend = min(h / stride_h + 1, pooled_height); - const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1; - const int pwend = min(w / stride_w + 1, pooled_width); - Dtype gradient = 0; - const Dtype* const rand_idx_slice = - rand_idx + (n * channels + c) * pooled_height * pooled_width; - const Dtype* const top_diff_slice = - top_diff + (n * channels + c) * pooled_height * pooled_width; - for (int ph = phstart; ph < phend; ++ph) { - for (int pw = pwstart; pw < pwend; ++pw) { - gradient += top_diff_slice[ph * pooled_width + pw] * - (index == static_cast(rand_idx_slice[ph * pooled_width + pw])); - } - } - bottom_diff[index] = gradient; - } -} - - -template -void PoolingLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { - return; - } - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int count = bottom[0]->count(); - caffe_gpu_set(count, Dtype(0.), bottom_diff); - // We'll output the mask to top[1] if it's of size >1. 
- const bool use_top_mask = top.size() > 1; - const int* mask = NULL; - const Dtype* top_mask = NULL; - switch (this->layer_param_.pooling_param().pool()) { - case PoolingParameter_PoolMethod_MAX: - if (use_top_mask) { - top_mask = top[1]->gpu_data(); - } else { - mask = max_idx_.gpu_data(); - } - // NOLINT_NEXT_LINE(whitespace/operators) - MaxPoolBackward<<>>( - count, top_diff, mask, top_mask, top[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, - kernel_h_, kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, - bottom_diff); - break; - case PoolingParameter_PoolMethod_AVE: - // NOLINT_NEXT_LINE(whitespace/operators) - AvePoolBackward<<>>( - count, top_diff, top[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, kernel_h_, - kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, bottom_diff); - break; - case PoolingParameter_PoolMethod_STOCHASTIC: - // NOLINT_NEXT_LINE(whitespace/operators) - StoPoolBackward<<>>( - count, rand_idx_.gpu_data(), top_diff, - top[0]->num(), channels_, height_, width_, pooled_height_, - pooled_width_, kernel_h_, kernel_w_, stride_h_, stride_w_, - bottom_diff); - break; - default: - LOG(FATAL) << "Unknown pooling method."; - } - CUDA_POST_KERNEL_CHECK; -} - - -INSTANTIATE_LAYER_GPU_FUNCS(PoolingLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/power_layer.cpp b/src/caffe/layers/power_layer.cpp index 4fe34c49..c3cb1759 100644 --- a/src/caffe/layers/power_layer.cpp +++ b/src/caffe/layers/power_layer.cpp @@ -4,17 +4,19 @@ #include "caffe/layer.hpp" #include "caffe/util/math_functions.hpp" #include "caffe/vision_layers.hpp" +#include "caffe/util/ocl_util.hpp" +#include "caffe/util/ocl_wrapper.hpp" namespace caffe { template void PowerLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - NeuronLayer::LayerSetUp(bottom, top); + const vector*>& top) { + NeuronLayer < Dtype > ::LayerSetUp(bottom, top); power_ = this->layer_param_.power_param().power(); scale_ = this->layer_param_.power_param().scale(); shift_ = this->layer_param_.power_param().shift(); - diff_scale_ = power_ * scale_; + diff_scale_ = power_ * scale_; } // Compute y = (shift + scale * x)^power @@ -44,8 +46,7 @@ void PowerLayer::Forward_cpu(const vector*>& bottom, template void PowerLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { if (propagate_down[0]) { Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); const int count = bottom[0]->count(); @@ -60,8 +61,8 @@ void PowerLayer::Backward_cpu(const vector*>& top, // Special case for y = (shift + scale * x)^2 // -> dy/dx = 2 * scale * (shift + scale * x) // = diff_scale * shift + diff_scale * scale * x - caffe_cpu_axpby(count, diff_scale_ * scale_, bottom_data, - Dtype(0), bottom_diff); + caffe_cpu_axpby(count, diff_scale_ * scale_, bottom_data, Dtype(0), + bottom_diff); if (shift_ != Dtype(0)) { caffe_add_scalar(count, diff_scale_ * shift_, bottom_diff); } @@ -82,7 +83,7 @@ void PowerLayer::Backward_cpu(const vector*>& top, caffe_add_scalar(count, shift_, bottom_diff); } const Dtype* top_data = top[0]->cpu_data(); - caffe_div(count, top_data, bottom_diff, bottom_diff); + caffe_div < Dtype > (count, top_data, bottom_diff, bottom_diff); if (diff_scale_ != Dtype(1)) { caffe_scal(count, diff_scale_, bottom_diff); } @@ -94,11 +95,86 @@ void PowerLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY +#ifndef CPU_ONLY +// begin: code modified for OpenCL port +template +void 
PowerLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + Dtype* top_data = top[0]->mutable_gpu_data(); + const int count = bottom[0]->count(); + // Special case where we can ignore the input: scale or power is 0. + if (diff_scale_ == Dtype(0)) { + Dtype value = (power_ == 0) ? Dtype(1) : pow(shift_, power_); + ocl_memset(top_data, value, count); + return; + } + const Dtype* bottom_data = bottom[0]->gpu_data(); + caffe_gpu_copy(count, bottom_data, top_data); + if (scale_ != Dtype(1)) { + caffe_gpu_scal(count, scale_, top_data); + } + if (shift_ != Dtype(0)) { + caffe_gpu_add_scalar(count, shift_, top_data); + } + if (power_ != Dtype(1)) { + caffe_gpu_powx(count, top_data, power_, top_data); + } +} + +template +void PowerLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[0]) { + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const int count = bottom[0]->count(); + const Dtype* top_diff = top[0]->gpu_diff(); + if (diff_scale_ == Dtype(0) || power_ == Dtype(1)) { + ocl_memset(bottom_diff, diff_scale_, count); + } else { + const Dtype* bottom_data = bottom[0]->gpu_data(); + // Compute dy/dx = scale * power * (shift + scale * x)^(power - 1) + // = diff_scale * y / (shift + scale * x) + if (power_ == Dtype(2)) { + // Special case for y = (shift + scale * x)^2 + // -> dy/dx = 2 * scale * (shift + scale * x) + // = diff_scale * shift + diff_scale * scale * x + caffe_gpu_axpby(count, diff_scale_ * scale_, bottom_data, Dtype(0), + bottom_diff); + if (shift_ != Dtype(0)) { + caffe_gpu_add_scalar(count, diff_scale_ * shift_, bottom_diff); + } + } else if (shift_ == Dtype(0)) { + // Special case for y = (scale * x)^power + // -> dy/dx = scale * power * (scale * x)^(power - 1) + // = scale * power * (scale * x)^power * (scale * x)^(-1) + // = power * y / x + const Dtype* top_data = top[0]->gpu_data(); + caffe_gpu_div(count, top_data, bottom_data, bottom_diff); + caffe_gpu_scal(count, power_, bottom_diff); + } else { + caffe_gpu_copy(count, bottom_data, bottom_diff); + if (scale_ != Dtype(1)) { + caffe_gpu_scal(count, scale_, bottom_diff); + } + if (shift_ != Dtype(0)) { + caffe_gpu_add_scalar(count, shift_, bottom_diff); + } + const Dtype* top_data = top[0]->gpu_data(); + caffe_gpu_div(count, top_data, bottom_diff, bottom_diff); + if (diff_scale_ != Dtype(1)) { + caffe_gpu_scal(count, diff_scale_, bottom_diff); + } + } + } + caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff); + } +} +// end: code modified for OpenCL port +#else STUB_GPU(PowerLayer); #endif -INSTANTIATE_CLASS(PowerLayer); -REGISTER_LAYER_CLASS(Power); +INSTANTIATE_CLASS (PowerLayer); +REGISTER_LAYER_CLASS (Power); } // namespace caffe diff --git a/src/caffe/layers/power_layer.cu b/src/caffe/layers/power_layer.cu deleted file mode 100644 index 90d94405..00000000 --- a/src/caffe/layers/power_layer.cu +++ /dev/null @@ -1,87 +0,0 @@ -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void PowerLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - // Special case where we can ignore the input: scale or power is 0. - if (diff_scale_ == Dtype(0)) { - Dtype value = (power_ == 0) ?
Dtype(1) : pow(shift_, power_); - caffe_gpu_set(count, value, top_data); - return; - } - const Dtype* bottom_data = bottom[0]->gpu_data(); - caffe_copy(count, bottom_data, top_data); - if (scale_ != Dtype(1)) { - caffe_gpu_scal(count, scale_, top_data); - } - if (shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, shift_, top_data); - } - if (power_ != Dtype(1)) { - caffe_gpu_powx(count, top_data, power_, top_data); - } -} - -template -void PowerLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int count = bottom[0]->count(); - const Dtype* top_diff = top[0]->gpu_diff(); - if (diff_scale_ == Dtype(0) || power_ == Dtype(1)) { - caffe_gpu_set(count, diff_scale_, bottom_diff); - } else { - const Dtype* bottom_data = bottom[0]->gpu_data(); - // Compute dy/dx = scale * power * (shift + scale * x)^(power - 1) - // = diff_scale * y / (shift + scale * x) - if (power_ == Dtype(2)) { - // Special case for y = (shift + scale * x)^2 - // -> dy/dx = 2 * scale * (shift + scale * x) - // = diff_scale * shift + diff_scale * scale * x - caffe_gpu_axpby(count, diff_scale_ * scale_, bottom_data, - Dtype(0), bottom_diff); - if (shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, diff_scale_ * shift_, bottom_diff); - } - } else if (shift_ == Dtype(0)) { - // Special case for y = (scale * x)^power - // -> dy/dx = scale * power * (scale * x)^(power - 1) - // = scale * power * (scale * x)^power * (scale * x)^(-1) - // = power * y / x - const Dtype* top_data = top[0]->gpu_data(); - caffe_gpu_div(count, top_data, bottom_data, bottom_diff); - caffe_gpu_scal(count, power_, bottom_diff); - } else { - caffe_copy(count, bottom_data, bottom_diff); - if (scale_ != Dtype(1)) { - caffe_gpu_scal(count, scale_, bottom_diff); - } - if (shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, shift_, bottom_diff); - } - const Dtype* top_data = top[0]->gpu_data(); - caffe_gpu_div(count, top_data, bottom_diff, bottom_diff); - if (diff_scale_ != Dtype(1)) { - caffe_gpu_scal(count, diff_scale_, bottom_diff); - } - } - } - caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff); - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(PowerLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/prelu_layer.cpp b/src/caffe/layers/prelu_layer.cpp index 81831755..55f2e303 100644 --- a/src/caffe/layers/prelu_layer.cpp +++ b/src/caffe/layers/prelu_layer.cpp @@ -24,14 +24,14 @@ void PReLULayer::LayerSetUp(const vector*>& bottom, } else { this->blobs_[0].reset(new Blob(vector(1, channels))); } - shared_ptr > filler; + shared_ptr < Filler > filler; if (prelu_param.has_filler()) { - filler.reset(GetFiller(prelu_param.filler())); + filler.reset(GetFiller < Dtype > (prelu_param.filler())); } else { FillerParameter filler_param; filler_param.set_type("constant"); filler_param.set_value(0.25); - filler.reset(GetFiller(filler_param)); + filler.reset(GetFiller < Dtype > (filler_param)); } filler->Fill(this->blobs_[0].get()); } @@ -89,8 +89,7 @@ void PReLULayer::Forward_cpu(const vector*>& bottom, template void PReLULayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { const Dtype* bottom_data = bottom[0]->cpu_data(); const Dtype* slope_data = this->blobs_[0]->cpu_data(); const Dtype* top_diff = top[0]->cpu_diff(); @@ -123,18 +122,88 @@ void PReLULayer::Backward_cpu(const vector*>& top, Dtype* bottom_diff = 
bottom[0]->mutable_cpu_diff(); for (int i = 0; i < count; ++i) { int c = (i / dim) % channels / div_factor; - bottom_diff[i] = top_diff[i] * ((bottom_data[i] > 0) - + slope_data[c] * (bottom_data[i] <= 0)); + bottom_diff[i] = top_diff[i] + * ((bottom_data[i] > 0) + slope_data[c] * (bottom_data[i] <= 0)); } } } +#ifndef CPU_ONLY +// begin: code modified for OpenCL port +template +void PReLULayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + const int count = bottom[0]->count(); + const int dim = bottom[0]->count(2); + const int channels = bottom[0]->channels(); + const Dtype* slope_data = this->blobs_[0]->gpu_data(); + const int div_factor = channel_shared_ ? channels : 1; + + if (top[0] == bottom[0]) { + caffe_gpu_copy(count, bottom_data, bottom_memory_.mutable_gpu_data()); + } + PReLUForward(count, channels, dim, bottom_data, top_data, slope_data, + div_factor); +} + +template +void PReLULayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); + const int count = bottom[0]->count(); + const int dim = bottom[0]->count(2); + const int channels = bottom[0]->channels(); + + if (top[0] == bottom[0]) { + bottom_data = bottom_memory_.gpu_data(); + } + + // Propagate to param + // Since to write bottom diff will affect top diff if top and bottom blobs + // are identical (in-place computaion), we first compute param backward to + // keep top_diff unchanged. + if (this->param_propagate_down_[0]) { + Dtype* slope_diff = this->blobs_[0]->mutable_gpu_diff(); + int cdim = channels * dim; + Dtype dsum = 0.; + for (int n = 0; n < bottom[0]->num(); ++n) { + // compute element-wise diff + // NOLINT_NEXT_LINE(whitespace/operators) + PReLUParamBackward(cdim, top_diff, top[0]->offset(n), bottom_data, + bottom[0]->offset(n), backward_buff_.mutable_gpu_diff()); + if (channel_shared_) { + Dtype d; + caffe_gpu_dot < Dtype + > (channels * dim, backward_buff_.gpu_diff(), multiplier_.gpu_data(), &d); + dsum += d; + } else { + caffe_gpu_gemv < Dtype + > (CblasNoTrans, channels, dim, 1., backward_buff_.gpu_diff(), multiplier_.gpu_data(), 1., slope_diff); + } + } + if (channel_shared_) { + caffe_gpu_add_scalar(this->blobs_[0]->count(), Dtype(dsum), slope_diff); + } + } + // Propagate to bottom + if (propagate_down[0]) { + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const Dtype* slope_data = this->blobs_[0]->gpu_data(); + int div_factor = channel_shared_ ? 
channels : 1; + // NOLINT_NEXT_LINE(whitespace/operators) + PReLUBackward(count, channels, dim, top_diff, bottom_data, bottom_diff, + slope_data, div_factor); + } +} +// end: code modified for OpenCL port -#ifdef CPU_ONLY +#else STUB_GPU(PReLULayer); #endif -INSTANTIATE_CLASS(PReLULayer); -REGISTER_LAYER_CLASS(PReLU); - +INSTANTIATE_CLASS (PReLULayer); +REGISTER_LAYER_CLASS (PReLU); } // namespace caffe diff --git a/src/caffe/layers/prelu_layer.cu b/src/caffe/layers/prelu_layer.cu deleted file mode 100644 index e1f20048..00000000 --- a/src/caffe/layers/prelu_layer.cu +++ /dev/null @@ -1,128 +0,0 @@ -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -// CUDA kernele for forward -template -__global__ void PReLUForward(const int n, const int channels, const int dim, - const Dtype* in, Dtype* out, const Dtype* slope_data, - const int div_factor) { - CUDA_KERNEL_LOOP(index, n) { - int c = (index / dim) % channels / div_factor; - out[index] = in[index] > 0 ? in[index] : in[index] * slope_data[c]; - } -} - -// CUDA kernel for bottom backward -template -__global__ void PReLUBackward(const int n, const int channels, const int dim, - const Dtype* in_diff, const Dtype* in_data, Dtype* out_diff, - const Dtype* slope_data, const int div_factor) { - CUDA_KERNEL_LOOP(index, n) { - int c = (index / dim) % channels / div_factor; - out_diff[index] = in_diff[index] * ((in_data[index] > 0) - + (in_data[index] <= 0) * slope_data[c]); - } -} - -// CUDA kernel for element-wise parameter backward -template -__global__ void PReLUParamBackward(const int n, const Dtype* in_diff, - const Dtype* in_data, Dtype* out_diff) { - CUDA_KERNEL_LOOP(index, n) { - out_diff[index] = in_diff[index] * in_data[index] * (in_data[index] <= 0); - } -} - -template -void PReLULayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - const int dim = bottom[0]->count(2); - const int channels = bottom[0]->channels(); - const Dtype* slope_data = this->blobs_[0]->gpu_data(); - const int div_factor = channel_shared_ ? channels : 1; - - // For in-place computation - if (top[0] == bottom[0]) { - caffe_copy(count, bottom_data, bottom_memory_.mutable_gpu_data()); - } - - // NOLINT_NEXT_LINE(whitespace/operators) - PReLUForward<<>>( - count, channels, dim, bottom_data, top_data, slope_data, div_factor); - CUDA_POST_KERNEL_CHECK; -} - -template -void PReLULayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - const int count = bottom[0]->count(); - const int dim = bottom[0]->count(2); - const int channels = bottom[0]->channels(); - - // For in-place computation - if (top[0] == bottom[0]) { - bottom_data = bottom_memory_.gpu_data(); - } - - // Propagate to param - // Since to write bottom diff will affect top diff if top and bottom blobs - // are identical (in-place computaion), we first compute param backward to - // keep top_diff unchanged. 
- if (this->param_propagate_down_[0]) { - Dtype* slope_diff = this->blobs_[0]->mutable_gpu_diff(); - int cdim = channels * dim; - Dtype dsum = 0.; - for (int n = 0; n < bottom[0]->num(); ++n) { - // compute element-wise diff - // NOLINT_NEXT_LINE(whitespace/operators) - PReLUParamBackward<<>>( - cdim, top_diff + top[0]->offset(n), - bottom_data + bottom[0]->offset(n), - backward_buff_.mutable_gpu_diff()); - CUDA_POST_KERNEL_CHECK; - if (channel_shared_) { - Dtype d; - caffe_gpu_dot(channels * dim, backward_buff_.gpu_diff(), - multiplier_.gpu_data(), &d); - dsum += d; - } else { - caffe_gpu_gemv(CblasNoTrans, channels, dim, 1., - backward_buff_.gpu_diff(), multiplier_.gpu_data(), 1., - slope_diff); - } - } - if (channel_shared_) { - caffe_gpu_add_scalar(this->blobs_[0]->count(), Dtype(dsum), slope_diff); - } - } - // Propagate to bottom - if (propagate_down[0]) { - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const Dtype* slope_data = this->blobs_[0]->gpu_data(); - int div_factor = channel_shared_ ? channels : 1; - // NOLINT_NEXT_LINE(whitespace/operators) - PReLUBackward<<>>( - count, channels, dim, top_diff, bottom_data, bottom_diff, slope_data, - div_factor); - CUDA_POST_KERNEL_CHECK; - } -} - - -INSTANTIATE_LAYER_GPU_FUNCS(PReLULayer); - - -} // namespace caffe diff --git a/src/caffe/layers/reduction_layer.cpp b/src/caffe/layers/reduction_layer.cpp index 8ae6329e..ace74b28 100644 --- a/src/caffe/layers/reduction_layer.cpp +++ b/src/caffe/layers/reduction_layer.cpp @@ -10,13 +10,13 @@ namespace caffe { template void ReductionLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { op_ = this->layer_param_.reduction_param().operation(); } template void ReductionLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { axis_ = bottom[0]->CanonicalAxisIndex( this->layer_param_.reduction_param().axis()); // In the output, we'll keep all axes up to the reduction axis, but @@ -24,13 +24,13 @@ void ReductionLayer::Reshape(const vector*>& bottom, // Note: currently reducing along non-tail axes is not supported; otherwise, // we'd need to also copy any axes following an "end_axis". vector top_shape(bottom[0]->shape().begin(), - bottom[0]->shape().begin() + axis_); + bottom[0]->shape().begin() + axis_); top[0]->Reshape(top_shape); num_ = bottom[0]->count(0, axis_); dim_ = bottom[0]->count(axis_); CHECK_EQ(num_, top[0]->count()); - if (op_ == ReductionParameter_ReductionOp_SUM || - op_ == ReductionParameter_ReductionOp_MEAN) { + if (op_ == ReductionParameter_ReductionOp_SUM + || op_ == ReductionParameter_ReductionOp_MEAN) { vector sum_mult_shape(1, dim_); sum_multiplier_.Reshape(sum_mult_shape); caffe_set(dim_, Dtype(1), sum_multiplier_.mutable_cpu_data()); @@ -42,8 +42,8 @@ void ReductionLayer::Reshape(const vector*>& bottom, } template -void ReductionLayer::Forward_cpu( - const vector*>& bottom, const vector*>& top) { +void ReductionLayer::Forward_cpu(const vector*>& bottom, + const vector*>& top) { const Dtype* bottom_data = bottom[0]->cpu_data(); const Dtype* mult_data = NULL; if (sum_multiplier_.count() > 0) { @@ -79,7 +79,9 @@ void ReductionLayer::Forward_cpu( template void ReductionLayer::Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { return; } + if (!propagate_down[0]) { + return; + } // Get bottom_data, if needed. 
const Dtype* bottom_data = NULL; switch (op_) { @@ -87,7 +89,7 @@ void ReductionLayer::Backward_cpu(const vector*>& top, case ReductionParameter_ReductionOp_SUM: case ReductionParameter_ReductionOp_MEAN: break; - // Operations that need bottom_data + // Operations that need bottom_data case ReductionParameter_ReductionOp_ASUM: case ReductionParameter_ReductionOp_SUMSQ: bottom_data = bottom[0]->cpu_data(); @@ -122,11 +124,102 @@ void ReductionLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY +#ifndef CPU_ONLY +// begin: code modified for OpenCL port +template +void ReductionLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + //Forward_cpu(bottom, top); +//return; + const Dtype* bottom_data = bottom[0]->gpu_data(); + const Dtype* mult_data = NULL; + if (sum_multiplier_.count() > 0) { + mult_data = sum_multiplier_.gpu_data(); + } + Dtype* top_data = top[0]->mutable_cpu_data(); + size_t bottom_offset = 0; + for (int i = 0; i < num_; ++i) { + switch (op_) { + case ReductionParameter_ReductionOp_SUM: + case ReductionParameter_ReductionOp_MEAN: + caffe_gpu_dot(dim_, mult_data, 0, bottom_data, bottom_offset, top_data); + break; + case ReductionParameter_ReductionOp_ASUM: + caffe_gpu_asum(dim_, bottom_data, bottom_offset, top_data); + break; + case ReductionParameter_ReductionOp_SUMSQ: + caffe_gpu_dot(dim_, bottom_data, bottom_offset, bottom_data, bottom_offset, top_data); + break; + default: + LOG(FATAL) << "Unknown reduction op: " + << ReductionParameter_ReductionOp_Name(op_); + } + bottom_offset += dim_; + ++top_data; + } + if (coeff_ != Dtype(1)) { + // Reset the top_data pointer. + top_data = top[0]->mutable_gpu_data(); + caffe_gpu_scal(num_, coeff_, top_data); + } +} + +template +void ReductionLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + if (!propagate_down[0]) { + return; + } + // Get bottom_data, if needed. 
+ const Dtype* bottom_data = NULL; + switch (op_) { + // Operations that don't need bottom_data + case ReductionParameter_ReductionOp_SUM: + case ReductionParameter_ReductionOp_MEAN: + break; + // Operations that need bottom_data + case ReductionParameter_ReductionOp_ASUM: + case ReductionParameter_ReductionOp_SUMSQ: + bottom_data = bottom[0]->gpu_data(); + break; + default: + LOG(FATAL) << "Unknown reduction op: " + << ReductionParameter_ReductionOp_Name(op_); + } + const Dtype* top_diff = top[0]->cpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + int bottom_data_offset = 0; + int bottom_diff_offset = 0; + for (int i = 0; i < num_; ++i) { + const Dtype bottom_coeff = (*top_diff) * coeff_; + switch (op_) { + case ReductionParameter_ReductionOp_SUM: + case ReductionParameter_ReductionOp_MEAN: + caffe_gpu_set(dim_, bottom_coeff, bottom_diff, bottom_diff_offset); + break; + case ReductionParameter_ReductionOp_ASUM: + caffe_gpu_sign(dim_, bottom_data, bottom_data_offset, bottom_diff, bottom_diff_offset); + caffe_gpu_scal(dim_, bottom_coeff, bottom_diff, bottom_diff_offset); + break; + case ReductionParameter_ReductionOp_SUMSQ: + caffe_gpu_scale(dim_, 2 * bottom_coeff, bottom_data, bottom_data_offset, bottom_diff, bottom_diff_offset); + break; + default: + LOG(FATAL) << "Unknown reduction op: " + << ReductionParameter_ReductionOp_Name(op_); + } + bottom_data_offset += dim_; + bottom_diff_offset += dim_; + ++top_diff; + } +} +// end: code modified for OpenCL port + +#else STUB_GPU(ReductionLayer); #endif -INSTANTIATE_CLASS(ReductionLayer); -REGISTER_LAYER_CLASS(Reduction); +INSTANTIATE_CLASS (ReductionLayer); +REGISTER_LAYER_CLASS (Reduction); } // namespace caffe diff --git a/src/caffe/layers/reduction_layer.cu b/src/caffe/layers/reduction_layer.cu deleted file mode 100644 index 2dbd3bc9..00000000 --- a/src/caffe/layers/reduction_layer.cu +++ /dev/null @@ -1,93 +0,0 @@ -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void ReductionLayer::Forward_gpu( - const vector*>& bottom, const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - const Dtype* mult_data = NULL; - if (sum_multiplier_.count() > 0) { - mult_data = sum_multiplier_.gpu_data(); - } - Dtype* top_data = top[0]->mutable_cpu_data(); - for (int i = 0; i < num_; ++i) { - switch (op_) { - case ReductionParameter_ReductionOp_SUM: - case ReductionParameter_ReductionOp_MEAN: - caffe_gpu_dot(dim_, mult_data, bottom_data, top_data); - break; - case ReductionParameter_ReductionOp_ASUM: - caffe_gpu_asum(dim_, bottom_data, top_data); - break; - case ReductionParameter_ReductionOp_SUMSQ: - caffe_gpu_dot(dim_, bottom_data, bottom_data, top_data); - break; - default: - LOG(FATAL) << "Unknown reduction op: " - << ReductionParameter_ReductionOp_Name(op_); - } - bottom_data += dim_; - ++top_data; - } - if (coeff_ != Dtype(1)) { - // Reset the top_data pointer. - top_data = top[0]->mutable_gpu_data(); - caffe_gpu_scal(num_, coeff_, top_data); - } -} - -template -void ReductionLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { return; } - // Get bottom_data, if needed. 
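The ported ReductionLayer above threads explicit element offsets (bottom_offset, bottom_data_offset, bottom_diff_offset) through the caffe_gpu_* wrappers instead of advancing raw pointers, since an OpenCL buffer handle cannot be offset by host-side pointer arithmetic the way a CUDA device pointer can. Per slice of length dim_, the forward reductions compute the following (a CPU sketch with illustrative names; MEAN is SUM with the 1/dim factor folded into coeff_):

    #include <cmath>
    #include <cstddef>
    enum ReduceOp { SUM, ASUM, SUMSQ };
    template <typename Dtype>
    Dtype reduce_slice(ReduceOp op, std::size_t dim, const Dtype* data,
                       std::size_t offset) {
      Dtype acc = 0;
      for (std::size_t i = 0; i < dim; ++i) {
        const Dtype v = data[offset + i];
        if (op == SUM)       acc += v;            // SUM and MEAN
        else if (op == ASUM) acc += std::abs(v);
        else                 acc += v * v;        // SUMSQ
      }
      return acc;
    }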
- const Dtype* bottom_data = NULL; - switch (op_) { - // Operations that don't need bottom_data - case ReductionParameter_ReductionOp_SUM: - case ReductionParameter_ReductionOp_MEAN: - break; - // Operations that need bottom_data - case ReductionParameter_ReductionOp_ASUM: - case ReductionParameter_ReductionOp_SUMSQ: - bottom_data = bottom[0]->gpu_data(); - break; - default: - LOG(FATAL) << "Unknown reduction op: " - << ReductionParameter_ReductionOp_Name(op_); - } - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - for (int i = 0; i < num_; ++i) { - const Dtype bottom_coeff = (*top_diff) * coeff_; - switch (op_) { - case ReductionParameter_ReductionOp_SUM: - case ReductionParameter_ReductionOp_MEAN: - caffe_gpu_set(dim_, bottom_coeff, bottom_diff); - break; - case ReductionParameter_ReductionOp_ASUM: - caffe_gpu_sign(dim_, bottom_data, bottom_diff); - caffe_gpu_scal(dim_, bottom_coeff, bottom_diff); - break; - case ReductionParameter_ReductionOp_SUMSQ: - caffe_gpu_scale(dim_, 2 * bottom_coeff, bottom_data, bottom_diff); - break; - default: - LOG(FATAL) << "Unknown reduction op: " - << ReductionParameter_ReductionOp_Name(op_); - } - bottom_data += dim_; - bottom_diff += dim_; - ++top_diff; - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(ReductionLayer); - -} // namespace caffe diff --git a/src/caffe/layers/relu_layer.cpp b/src/caffe/layers/relu_layer.cpp index cc00319a..3d2eaf2e 100644 --- a/src/caffe/layers/relu_layer.cpp +++ b/src/caffe/layers/relu_layer.cpp @@ -5,7 +5,6 @@ #include "caffe/vision_layers.hpp" namespace caffe { - template void ReLULayer::Forward_cpu(const vector*>& bottom, const vector*>& top) { @@ -21,8 +20,7 @@ void ReLULayer::Forward_cpu(const vector*>& bottom, template void ReLULayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { if (propagate_down[0]) { const Dtype* bottom_data = bottom[0]->cpu_data(); const Dtype* top_diff = top[0]->cpu_diff(); @@ -30,17 +28,42 @@ void ReLULayer::Backward_cpu(const vector*>& top, const int count = bottom[0]->count(); Dtype negative_slope = this->layer_param_.relu_param().negative_slope(); for (int i = 0; i < count; ++i) { - bottom_diff[i] = top_diff[i] * ((bottom_data[i] > 0) - + negative_slope * (bottom_data[i] <= 0)); + bottom_diff[i] = top_diff[i] + * ((bottom_data[i] > 0) + negative_slope * (bottom_data[i] <= 0)); } } } +#ifndef CPU_ONLY +// begin: code modified for OpenCL port +template +void ReLULayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + const int count = bottom[0]->count(); + Dtype negative_slope = this->layer_param_.relu_param().negative_slope(); + ReLUForward(count, bottom_data, top_data, negative_slope); +} + +template +void ReLULayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[0]) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const int count = bottom[0]->count(); + Dtype negative_slope = this->layer_param_.relu_param().negative_slope(); + ReLUBackward(count, top_diff, bottom_data, bottom_diff, negative_slope); + } +} +// end: code modified for OpenCL port -#ifdef CPU_ONLY +#else STUB_GPU(ReLULayer); #endif -INSTANTIATE_CLASS(ReLULayer); +INSTANTIATE_CLASS (ReLULayer); } // 
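ReLUForward and ReLUBackward in the relu_layer.cpp hunk above are host wrappers around the OpenCL kernels; the math they have to reproduce is the leaky ReLU of the CPU path and of the deleted relu_layer.cu. A CPU reference sketch (function names illustrative):

    template <typename Dtype>
    void relu_forward_ref(int n, const Dtype* in, Dtype* out, Dtype negative_slope) {
      for (int i = 0; i < n; ++i)
        out[i] = in[i] > 0 ? in[i] : in[i] * negative_slope;
    }

    template <typename Dtype>
    void relu_backward_ref(int n, const Dtype* in_diff, const Dtype* in_data,
                           Dtype* out_diff, Dtype negative_slope) {
      for (int i = 0; i < n; ++i)
        out_diff[i] = in_diff[i]
            * ((in_data[i] > 0) + negative_slope * (in_data[i] <= 0));
    }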
namespace caffe diff --git a/src/caffe/layers/relu_layer.cu b/src/caffe/layers/relu_layer.cu deleted file mode 100644 index b8924c85..00000000 --- a/src/caffe/layers/relu_layer.cu +++ /dev/null @@ -1,65 +0,0 @@ -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -__global__ void ReLUForward(const int n, const Dtype* in, Dtype* out, - Dtype negative_slope) { - CUDA_KERNEL_LOOP(index, n) { - out[index] = in[index] > 0 ? in[index] : in[index] * negative_slope; - } -} - -template -void ReLULayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - Dtype negative_slope = this->layer_param_.relu_param().negative_slope(); - // NOLINT_NEXT_LINE(whitespace/operators) - ReLUForward<<>>( - count, bottom_data, top_data, negative_slope); - CUDA_POST_KERNEL_CHECK; - // << " count: " << count << " bottom_data: " - // << (unsigned long)bottom_data - // << " top_data: " << (unsigned long)top_data - // << " blocks: " << CAFFE_GET_BLOCKS(count) - // << " threads: " << CAFFE_CUDA_NUM_THREADS; -} - -template -__global__ void ReLUBackward(const int n, const Dtype* in_diff, - const Dtype* in_data, Dtype* out_diff, Dtype negative_slope) { - CUDA_KERNEL_LOOP(index, n) { - out_diff[index] = in_diff[index] * ((in_data[index] > 0) - + (in_data[index] <= 0) * negative_slope); - } -} - -template -void ReLULayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int count = bottom[0]->count(); - Dtype negative_slope = this->layer_param_.relu_param().negative_slope(); - // NOLINT_NEXT_LINE(whitespace/operators) - ReLUBackward<<>>( - count, top_diff, bottom_data, bottom_diff, negative_slope); - CUDA_POST_KERNEL_CHECK; - } -} - - -INSTANTIATE_LAYER_GPU_FUNCS(ReLULayer); - - -} // namespace caffe diff --git a/src/caffe/layers/reshape_layer.cpp b/src/caffe/layers/reshape_layer.cpp index ffe970f2..a2377d87 100644 --- a/src/caffe/layers/reshape_layer.cpp +++ b/src/caffe/layers/reshape_layer.cpp @@ -31,8 +31,9 @@ template void ReshapeLayer::Reshape(const vector*>& bottom, const vector*>& top) { const int input_start_axis = this->layer_param_.reshape_param().axis(); - const int start_axis = (input_start_axis >= 0) ? input_start_axis : - bottom[0]->num_axes() + input_start_axis + 1; + const int start_axis = + (input_start_axis >= 0) ? 
+ input_start_axis : bottom[0]->num_axes() + input_start_axis + 1; CHECK_GE(start_axis, 0) << "axis " << input_start_axis << " out of range"; CHECK_LE(start_axis, bottom[0]->num_axes()) << "axis " << input_start_axis << " out of range for " << bottom[0]->num_axes() << "-D input blob"; @@ -63,8 +64,8 @@ void ReshapeLayer::Reshape(const vector*>& bottom, CHECK_GT(bottom[0]->num_axes(), start_axis + copy_axis_index) << "new shape contains a 0, but there was no corresponding bottom axis " << "to copy"; - top_shape[start_axis + copy_axis_index] = - bottom[0]->shape(start_axis + copy_axis_index); + top_shape[start_axis + copy_axis_index] = bottom[0]->shape( + start_axis + copy_axis_index); } if (inferred_axis_ >= 0) { // A -1 dim was specified; infer the correct dimension by computing the @@ -89,7 +90,7 @@ void ReshapeLayer::Reshape(const vector*>& bottom, top[0]->ShareDiff(*bottom[0]); } -INSTANTIATE_CLASS(ReshapeLayer); -REGISTER_LAYER_CLASS(Reshape); +INSTANTIATE_CLASS (ReshapeLayer); +REGISTER_LAYER_CLASS (Reshape); } // namespace caffe diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp index cc236fe1..f074ac51 100644 --- a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp +++ b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp @@ -11,7 +11,7 @@ namespace caffe { template void SigmoidCrossEntropyLossLayer::LayerSetUp( const vector*>& bottom, const vector*>& top) { - LossLayer::LayerSetUp(bottom, top); + LossLayer < Dtype > ::LayerSetUp(bottom, top); sigmoid_bottom_vec_.clear(); sigmoid_bottom_vec_.push_back(bottom[0]); sigmoid_top_vec_.clear(); @@ -22,9 +22,9 @@ void SigmoidCrossEntropyLossLayer::LayerSetUp( template void SigmoidCrossEntropyLossLayer::Reshape( const vector*>& bottom, const vector*>& top) { - LossLayer::Reshape(bottom, top); - CHECK_EQ(bottom[0]->count(), bottom[1]->count()) << - "SIGMOID_CROSS_ENTROPY_LOSS layer inputs must have the same count."; + LossLayer < Dtype > ::Reshape(bottom, top); + CHECK_EQ(bottom[0]->count(), bottom[1]->count()) + << "SIGMOID_CROSS_ENTROPY_LOSS layer inputs must have the same count."; sigmoid_layer_->Reshape(sigmoid_bottom_vec_, sigmoid_top_vec_); } @@ -42,8 +42,9 @@ void SigmoidCrossEntropyLossLayer::Forward_cpu( const Dtype* target = bottom[1]->cpu_data(); Dtype loss = 0; for (int i = 0; i < count; ++i) { - loss -= input_data[i] * (target[i] - (input_data[i] >= 0)) - - log(1 + exp(input_data[i] - 2 * input_data[i] * (input_data[i] >= 0))); + loss -= input_data[i] * (target[i] - (input_data[i] >= 0)) + - log( + 1 + exp(input_data[i] - 2 * input_data[i] * (input_data[i] >= 0))); } top[0]->mutable_cpu_data()[0] = loss / num; } @@ -54,7 +55,7 @@ void SigmoidCrossEntropyLossLayer::Backward_cpu( const vector*>& bottom) { if (propagate_down[1]) { LOG(FATAL) << this->type() - << " Layer cannot backpropagate to label inputs."; + << " Layer cannot backpropagate to label inputs."; } if (propagate_down[0]) { // First, compute the diff @@ -70,11 +71,35 @@ void SigmoidCrossEntropyLossLayer::Backward_cpu( } } -#ifdef CPU_ONLY +#ifndef CPU_ONLY +template +void SigmoidCrossEntropyLossLayer::Backward_gpu( + const vector*>& top, const vector& propagate_down, + const vector*>& bottom) { + if (propagate_down[1]) { + LOG(FATAL) << this->type() + << " Layer cannot backpropagate to label inputs."; + } + if (propagate_down[0]) { + // First, compute the diff + const int count = bottom[0]->count(); + const int num = bottom[0]->num(); + const Dtype* sigmoid_output_data = 
sigmoid_output_->gpu_data(); + const Dtype* target = bottom[1]->gpu_data(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + caffe_gpu_copy(count, sigmoid_output_data, bottom_diff); + caffe_gpu_axpy(count, Dtype(-1), target, bottom_diff); + // Scale down gradient + const Dtype loss_weight = top[0]->cpu_diff()[0]; + caffe_gpu_scal(count, loss_weight / num, bottom_diff); + } +} + +#else STUB_GPU_BACKWARD(SigmoidCrossEntropyLossLayer, Backward); #endif -INSTANTIATE_CLASS(SigmoidCrossEntropyLossLayer); -REGISTER_LAYER_CLASS(SigmoidCrossEntropyLoss); +INSTANTIATE_CLASS (SigmoidCrossEntropyLossLayer); +REGISTER_LAYER_CLASS (SigmoidCrossEntropyLoss); } // namespace caffe diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu deleted file mode 100644 index 547fa80c..00000000 --- a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu +++ /dev/null @@ -1,37 +0,0 @@ -#include -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void SigmoidCrossEntropyLossLayer::Backward_gpu( - const vector*>& top, const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[1]) { - LOG(FATAL) << this->type() - << " Layer cannot backpropagate to label inputs."; - } - if (propagate_down[0]) { - // First, compute the diff - const int count = bottom[0]->count(); - const int num = bottom[0]->num(); - const Dtype* sigmoid_output_data = sigmoid_output_->gpu_data(); - const Dtype* target = bottom[1]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - caffe_copy(count, sigmoid_output_data, bottom_diff); - caffe_gpu_axpy(count, Dtype(-1), target, bottom_diff); - // Scale down gradient - const Dtype loss_weight = top[0]->cpu_diff()[0]; - caffe_gpu_scal(count, loss_weight / num, bottom_diff); - } -} - -INSTANTIATE_LAYER_GPU_BACKWARD(SigmoidCrossEntropyLossLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/sigmoid_layer.cpp b/src/caffe/layers/sigmoid_layer.cpp index 48c38490..b820e8ff 100644 --- a/src/caffe/layers/sigmoid_layer.cpp +++ b/src/caffe/layers/sigmoid_layer.cpp @@ -4,6 +4,7 @@ #include "caffe/layer.hpp" #include "caffe/vision_layers.hpp" +#include "caffe/util/ocl_wrapper.hpp" namespace caffe { @@ -25,8 +26,7 @@ void SigmoidLayer::Forward_cpu(const vector*>& bottom, template void SigmoidLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { if (propagate_down[0]) { const Dtype* top_data = top[0]->cpu_data(); const Dtype* top_diff = top[0]->cpu_diff(); @@ -39,11 +39,37 @@ void SigmoidLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY +#ifndef CPU_ONLY +// begin: code modified for OpenCL port + +template +void SigmoidLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + const int count = bottom[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + SigmoidForward(count, bottom_data, top_data); +} + +template +void SigmoidLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[0]) { + const Dtype* top_data = top[0]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const int count = bottom[0]->count(); + // 
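The SigmoidCrossEntropyLoss Backward_gpu added above reduces to sigmoid(x) - target, scaled by loss_weight / num, while the CPU forward pass uses the overflow-safe rewrite of -[t log s + (1 - t) log(1 - s)] with s = sigmoid(x). A CPU sketch of both, with an illustrative name and the loss-weight scaling left out:

    #include <cmath>
    template <typename Dtype>
    Dtype sigmoid_xent_ref(int count, int num, const Dtype* x, const Dtype* t,
                           Dtype* diff) {
      Dtype loss = 0;
      for (int i = 0; i < count; ++i) {
        // Stable form of -[t*log(s) + (1-t)*log(1-s)] with s = sigmoid(x).
        loss -= x[i] * (t[i] - (x[i] >= 0))
              - std::log(1 + std::exp(x[i] - 2 * x[i] * (x[i] >= 0)));
        diff[i] = (1 / (1 + std::exp(-x[i])) - t[i]) / num;  // (sigmoid(x) - t) / num
      }
      return loss / num;
    }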
NOLINT_NEXT_LINE(whitespace/operators) + SigmoidBackward(count, top_diff, top_data, bottom_diff); + } +} +// end: code modified for OpenCL port + +#else STUB_GPU(SigmoidLayer); #endif -INSTANTIATE_CLASS(SigmoidLayer); - +INSTANTIATE_CLASS (SigmoidLayer); } // namespace caffe diff --git a/src/caffe/layers/sigmoid_layer.cu b/src/caffe/layers/sigmoid_layer.cu deleted file mode 100644 index e1af0657..00000000 --- a/src/caffe/layers/sigmoid_layer.cu +++ /dev/null @@ -1,62 +0,0 @@ -#include -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -__global__ void SigmoidForward(const int n, const Dtype* in, Dtype* out) { - CUDA_KERNEL_LOOP(index, n) { - out[index] = 1. / (1. + exp(-in[index])); - } -} - -template -void SigmoidLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - SigmoidForward<<>>( - count, bottom_data, top_data); - CUDA_POST_KERNEL_CHECK; - // << " count: " << count << " bottom_data: " - // << (unsigned long)bottom_data - // << " top_data: " << (unsigned long)top_data - // << " blocks: " << CAFFE_GET_BLOCKS(count) - // << " threads: " << CAFFE_CUDA_NUM_THREADS; -} - -template -__global__ void SigmoidBackward(const int n, const Dtype* in_diff, - const Dtype* out_data, Dtype* out_diff) { - CUDA_KERNEL_LOOP(index, n) { - const Dtype sigmoid_x = out_data[index]; - out_diff[index] = in_diff[index] * sigmoid_x * (1 - sigmoid_x); - } -} - -template -void SigmoidLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - SigmoidBackward<<>>( - count, top_diff, top_data, bottom_diff); - CUDA_POST_KERNEL_CHECK; - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(SigmoidLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/silence_layer.cpp b/src/caffe/layers/silence_layer.cpp index 4abf9eff..4436584b 100644 --- a/src/caffe/layers/silence_layer.cpp +++ b/src/caffe/layers/silence_layer.cpp @@ -8,20 +8,39 @@ namespace caffe { template void SilenceLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { for (int i = 0; i < bottom.size(); ++i) { if (propagate_down[i]) { - caffe_set(bottom[i]->count(), Dtype(0), - bottom[i]->mutable_cpu_data()); + caffe_set(bottom[i]->count(), Dtype(0), bottom[i]->mutable_cpu_data()); } } } -#ifdef CPU_ONLY +#ifndef CPU_ONLY +// begin: code modified for OpenCL port +template +void SilenceLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + // Do nothing. 
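SigmoidForward and SigmoidBackward above come from the newly included caffe/util/ocl_wrapper.hpp; they must match s(x) = 1 / (1 + e^-x) on the forward pass and ds/dx = s(x)(1 - s(x)) on the backward pass, which reuses the already-computed top data. Reference sketch (names illustrative):

    #include <cmath>
    template <typename Dtype>
    void sigmoid_ref(int n, const Dtype* in, Dtype* out) {
      for (int i = 0; i < n; ++i) out[i] = 1 / (1 + std::exp(-in[i]));
    }

    template <typename Dtype>
    void sigmoid_backward_ref(int n, const Dtype* top_diff, const Dtype* top_data,
                              Dtype* bottom_diff) {
      for (int i = 0; i < n; ++i) {
        const Dtype s = top_data[i];            // s = sigmoid(x) from the forward pass
        bottom_diff[i] = top_diff[i] * s * (1 - s);
      }
    }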
+} + +template +void SilenceLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + for (int i = 0; i < bottom.size(); ++i) { + if (propagate_down[i]) { + caffe_gpu_set(bottom[i]->count(), Dtype(0), + bottom[i]->mutable_gpu_data()); + } + } +} +// end: code modified for OpenCL port + +#else STUB_GPU(SilenceLayer); #endif -INSTANTIATE_CLASS(SilenceLayer); -REGISTER_LAYER_CLASS(Silence); +INSTANTIATE_CLASS (SilenceLayer); +REGISTER_LAYER_CLASS (Silence); } // namespace caffe diff --git a/src/caffe/layers/silence_layer.cu b/src/caffe/layers/silence_layer.cu deleted file mode 100644 index 8d044ee7..00000000 --- a/src/caffe/layers/silence_layer.cu +++ /dev/null @@ -1,28 +0,0 @@ -#include - -#include "caffe/common_layers.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -void SilenceLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - // Do nothing. -} - -template -void SilenceLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - for (int i = 0; i < bottom.size(); ++i) { - if (propagate_down[i]) { - caffe_gpu_set(bottom[i]->count(), Dtype(0), - bottom[i]->mutable_gpu_data()); - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(SilenceLayer); - -} // namespace caffe diff --git a/src/caffe/layers/slice_layer.cpp b/src/caffe/layers/slice_layer.cpp index e4418c9c..de21e936 100644 --- a/src/caffe/layers/slice_layer.cpp +++ b/src/caffe/layers/slice_layer.cpp @@ -9,19 +9,18 @@ namespace caffe { template void SliceLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const SliceParameter& slice_param = this->layer_param_.slice_param(); CHECK(!(slice_param.has_axis() && slice_param.has_slice_dim())) << "Either axis or slice_dim should be specified; not both."; slice_point_.clear(); - std::copy(slice_param.slice_point().begin(), - slice_param.slice_point().end(), + std::copy(slice_param.slice_point().begin(), slice_param.slice_point().end(), std::back_inserter(slice_point_)); } template void SliceLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const int num_axes = bottom[0]->num_axes(); const SliceParameter& slice_param = this->layer_param_.slice_param(); if (slice_param.has_slice_dim()) { @@ -57,9 +56,9 @@ void SliceLayer::Reshape(const vector*>& bottom, count += top[i]->count(); } } else { - CHECK_EQ(bottom_slice_axis % top.size(), 0) - << "Number of top blobs (" << top.size() << ") should evenly " - << "divide input slice axis (" << bottom_slice_axis << ")"; + CHECK_EQ(bottom_slice_axis % top.size(), 0) << "Number of top blobs (" + << top.size() << ") should evenly " << "divide input slice axis (" + << bottom_slice_axis << ")"; top_shape[slice_axis_] = bottom_slice_axis / top.size(); for (int i = 0; i < top.size(); ++i) { top[i]->Reshape(top_shape); @@ -71,7 +70,7 @@ void SliceLayer::Reshape(const vector*>& bottom, template void SliceLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { int offset_slice_axis = 0; const Dtype* bottom_data = bottom[0]->cpu_data(); const int bottom_slice_axis = bottom[0]->shape(slice_axis_); @@ -80,10 +79,10 @@ void SliceLayer::Forward_cpu(const vector*>& bottom, const int top_slice_axis = top[i]->shape(slice_axis_); for (int n = 0; n < num_slices_; ++n) { const int top_offset = n * top_slice_axis * slice_size_; - const int bottom_offset = - (n * bottom_slice_axis + offset_slice_axis) * slice_size_; - 
caffe_copy(top_slice_axis * slice_size_, - bottom_data + bottom_offset, top_data + top_offset); + const int bottom_offset = (n * bottom_slice_axis + offset_slice_axis) + * slice_size_; + caffe_copy(top_slice_axis * slice_size_, bottom_data + bottom_offset, + top_data + top_offset); } offset_slice_axis += top_slice_axis; } @@ -91,8 +90,10 @@ void SliceLayer::Forward_cpu(const vector*>& bottom, template void SliceLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { return; } + const vector& propagate_down, const vector*>& bottom) { + if (!propagate_down[0]) { + return; + } int offset_slice_axis = 0; Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); const int bottom_slice_axis = bottom[0]->shape(slice_axis_); @@ -101,20 +102,63 @@ void SliceLayer::Backward_cpu(const vector*>& top, const int top_slice_axis = top[i]->shape(slice_axis_); for (int n = 0; n < num_slices_; ++n) { const int top_offset = n * top_slice_axis * slice_size_; - const int bottom_offset = - (n * bottom_slice_axis + offset_slice_axis) * slice_size_; - caffe_copy(top_slice_axis * slice_size_, - top_diff + top_offset, bottom_diff + bottom_offset); + const int bottom_offset = (n * bottom_slice_axis + offset_slice_axis) + * slice_size_; + caffe_copy(top_slice_axis * slice_size_, top_diff + top_offset, + bottom_diff + bottom_offset); } offset_slice_axis += top_slice_axis; } } -#ifdef CPU_ONLY +#ifndef CPU_ONLY +// begin: code modified for OpenCL port +template +void SliceLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + if (top.size() == 1) { return; } + int offset_slice_axis = 0; + const Dtype* bottom_data = bottom[0]->gpu_data(); + const int bottom_slice_axis = bottom[0]->shape(slice_axis_); + const bool kForward = true; + for (int i = 0; i < top.size(); ++i) { + Dtype* top_data = top[i]->mutable_gpu_data(); + const int top_slice_axis = top[i]->shape(slice_axis_); + const int top_slice_size = top_slice_axis * slice_size_; + const int nthreads = top_slice_size * num_slices_; + Slice // NOLINT_NEXT_LINE(whitespace/operators) + (nthreads, bottom_data, kForward, num_slices_, slice_size_, + bottom_slice_axis, top_slice_axis, offset_slice_axis, top_data); + offset_slice_axis += top_slice_axis; + } +} + +template +void SliceLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + if (!propagate_down[0] || top.size() == 1) { return; } + int offset_slice_axis = 0; + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const int bottom_slice_axis = bottom[0]->shape(slice_axis_); + const bool kForward = false; + for (int i = 0; i < top.size(); ++i) { + const Dtype* top_diff = top[i]->gpu_diff(); + const int top_slice_axis = top[i]->shape(slice_axis_); + const int top_slice_size = top_slice_axis * slice_size_; + const int nthreads = top_slice_size * num_slices_; + Slice // NOLINT_NEXT_LINE(whitespace/operators) + (nthreads, top_diff, kForward, num_slices_, slice_size_, + bottom_slice_axis, top_slice_axis, offset_slice_axis, bottom_diff); + offset_slice_axis += top_slice_axis; + } +} +// end: code modified for OpenCL port + +#else STUB_GPU(SliceLayer); #endif -INSTANTIATE_CLASS(SliceLayer); -REGISTER_LAYER_CLASS(Slice); +INSTANTIATE_CLASS (SliceLayer); +REGISTER_LAYER_CLASS (Slice); } // namespace caffe diff --git a/src/caffe/layers/slice_layer.cu b/src/caffe/layers/slice_layer.cu deleted file mode 100644 index 796841d3..00000000 --- a/src/caffe/layers/slice_layer.cu +++ /dev/null @@ -1,71 +0,0 @@ 
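Both GPU paths of the slice layer above call a host-side Slice wrapper that keeps the index arithmetic of the deleted CUDA kernel, and the port adds an early return when top.size() == 1, which is a pure pass-through. The mapping from a flat top index to the corresponding bottom index, as a CPU sketch (name illustrative):

    template <typename Dtype>
    void slice_ref(int nthreads, const Dtype* in, bool forward, int num_slices,
                   int slice_size, int bottom_slice_axis, int top_slice_axis,
                   int offset_slice_axis, Dtype* out) {
      for (int index = 0; index < nthreads; ++index) {
        const int total_slice_size = slice_size * top_slice_axis;
        const int slice_num = index / total_slice_size;
        const int slice_index = index % total_slice_size;
        const int bottom_index = slice_index +
            (slice_num * bottom_slice_axis + offset_slice_axis) * slice_size;
        if (forward) out[index] = in[bottom_index];   // bottom data -> top data
        else         out[bottom_index] = in[index];   // top diff -> bottom diff
      }
    }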
-#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -__global__ void Slice(const int nthreads, const Dtype* in_data, - const bool forward, const int num_slices, const int slice_size, - const int bottom_slice_axis, const int top_slice_axis, - const int offset_slice_axis, Dtype* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int total_slice_size = slice_size * top_slice_axis; - const int slice_num = index / total_slice_size; - const int slice_index = index % total_slice_size; - const int bottom_index = slice_index + - (slice_num * bottom_slice_axis + offset_slice_axis) * slice_size; - if (forward) { - out_data[index] = in_data[bottom_index]; - } else { - out_data[bottom_index] = in_data[index]; - } - } -} - -template -void SliceLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - int offset_slice_axis = 0; - const Dtype* bottom_data = bottom[0]->gpu_data(); - const int bottom_slice_axis = bottom[0]->shape(slice_axis_); - const bool kForward = true; - for (int i = 0; i < top.size(); ++i) { - Dtype* top_data = top[i]->mutable_gpu_data(); - const int top_slice_axis = top[i]->shape(slice_axis_); - const int top_slice_size = top_slice_axis * slice_size_; - const int nthreads = top_slice_size * num_slices_; - Slice // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - nthreads, bottom_data, kForward, num_slices_, slice_size_, - bottom_slice_axis, top_slice_axis, offset_slice_axis, top_data); - offset_slice_axis += top_slice_axis; - } -} - -template -void SliceLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { return; } - int offset_slice_axis = 0; - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int bottom_slice_axis = bottom[0]->shape(slice_axis_); - const bool kForward = false; - for (int i = 0; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->gpu_diff(); - const int top_slice_axis = top[i]->shape(slice_axis_); - const int top_slice_size = top_slice_axis * slice_size_; - const int nthreads = top_slice_size * num_slices_; - Slice // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - nthreads, top_diff, kForward, num_slices_, slice_size_, - bottom_slice_axis, top_slice_axis, offset_slice_axis, bottom_diff); - offset_slice_axis += top_slice_axis; - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(SliceLayer); - -} // namespace caffe diff --git a/src/caffe/layers/softmax_layer.cpp b/src/caffe/layers/softmax_layer.cpp index 04712c9e..1269b058 100644 --- a/src/caffe/layers/softmax_layer.cpp +++ b/src/caffe/layers/softmax_layer.cpp @@ -9,9 +9,9 @@ namespace caffe { template void SoftmaxLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - softmax_axis_ = - bottom[0]->CanonicalAxisIndex(this->layer_param_.softmax_param().axis()); + const vector*>& top) { + softmax_axis_ = bottom[0]->CanonicalAxisIndex( + this->layer_param_.softmax_param().axis()); top[0]->ReshapeLike(*bottom[0]); vector mult_dims(1, bottom[0]->shape(softmax_axis_)); sum_multiplier_.Reshape(mult_dims); @@ -24,6 +24,10 @@ void SoftmaxLayer::Reshape(const vector*>& bottom, scale_.Reshape(scale_dims); } +template +SoftmaxLayer::~SoftmaxLayer() { +} + template void SoftmaxLayer::Forward_cpu(const vector*>& bottom, const vector*>& top) { @@ -45,13 +49,13 @@ void SoftmaxLayer::Forward_cpu(const vector*>& bottom, } } // subtraction - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, channels, inner_num_, - 1, -1., 
sum_multiplier_.cpu_data(), scale_data, 1., top_data); + caffe_cpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, channels, inner_num_, 1, -1., sum_multiplier_.cpu_data(), scale_data, 1., top_data); // exponentiation - caffe_exp(dim, top_data, top_data); + caffe_exp < Dtype > (dim, top_data, top_data); // sum after exp - caffe_cpu_gemv(CblasTrans, channels, inner_num_, 1., - top_data, sum_multiplier_.cpu_data(), 0., scale_data); + caffe_cpu_gemv < Dtype + > (CblasTrans, channels, inner_num_, 1., top_data, sum_multiplier_.cpu_data(), 0., scale_data); // division for (int j = 0; j < channels; j++) { caffe_div(inner_num_, top_data, scale_data, top_data); @@ -62,8 +66,7 @@ void SoftmaxLayer::Forward_cpu(const vector*>& bottom, template void SoftmaxLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { const Dtype* top_diff = top[0]->cpu_diff(); const Dtype* top_data = top[0]->cpu_data(); Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); @@ -74,23 +77,82 @@ void SoftmaxLayer::Backward_cpu(const vector*>& top, for (int i = 0; i < outer_num_; ++i) { // compute dot(top_diff, top_data) and subtract them from the bottom diff for (int k = 0; k < inner_num_; ++k) { - scale_data[k] = caffe_cpu_strided_dot(channels, - bottom_diff + i * dim + k, inner_num_, - top_data + i * dim + k, inner_num_); + scale_data[k] = caffe_cpu_strided_dot < Dtype + > (channels, bottom_diff + i * dim + k, inner_num_, top_data + i * dim + + k, inner_num_); } // subtraction - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, channels, inner_num_, 1, - -1., sum_multiplier_.cpu_data(), scale_data, 1., bottom_diff + i * dim); + caffe_cpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, channels, inner_num_, 1, -1., sum_multiplier_.cpu_data(), scale_data, 1., bottom_diff + + i * dim); } // elementwise multiplication caffe_mul(top[0]->count(), bottom_diff, top_data, bottom_diff); } +#ifndef CPU_ONLY +// begin: code modified for OpenCL port +template +void SoftmaxLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + Dtype* scale_data = scale_.mutable_gpu_data(); + int count = bottom[0]->count(); + int channels = top[0]->shape(softmax_axis_); + + caffe_gpu_copy(count, bottom_data, top_data); + // We need to subtract the max to avoid numerical issues, compute the exp, + // and then normalize. 
+ // compute max + // NOLINT_NEXT_LINE(whitespace/operators) + + kernel_channel_max < Dtype + > (outer_num_, channels, inner_num_, top_data, scale_data); + // subtract + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_channel_subtract < Dtype + > (count, outer_num_, channels, inner_num_, scale_data, top_data); + // exponentiate + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_exp < Dtype > (count, top_data, top_data); + // sum after exp + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_channel_sum < Dtype + > (outer_num_, channels, inner_num_, top_data, scale_data); + // divide + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_channel_div < Dtype + > (count, outer_num_, channels, inner_num_, scale_data, top_data); +} + +template +void SoftmaxLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + const Dtype* top_diff = top[0]->gpu_diff(); + const Dtype* top_data = top[0]->gpu_data(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + Dtype* scale_data = scale_.mutable_gpu_data(); + int count = top[0]->count(); + int channels = top[0]->shape(softmax_axis_); + caffe_gpu_copy(count, top_diff, bottom_diff); + // Compute inner1d(top_diff, top_data) and subtract them from the bottom diff. + // NOLINT_NEXT_LINE(whitespace/operators) + + kernel_channel_dot < Dtype + > (outer_num_, channels, inner_num_, top_diff, top_data, scale_data); + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_channel_subtract < Dtype + > (count, outer_num_, channels, inner_num_, scale_data, bottom_diff); + // elementwise multiplication + caffe_gpu_mul < Dtype > (top[0]->count(), bottom_diff, top_data, bottom_diff); -#ifdef CPU_ONLY +} +// end: code modified for OpenCL port +#else STUB_GPU(SoftmaxLayer); #endif -INSTANTIATE_CLASS(SoftmaxLayer); +INSTANTIATE_CLASS (SoftmaxLayer); } // namespace caffe diff --git a/src/caffe/layers/softmax_layer.cu b/src/caffe/layers/softmax_layer.cu deleted file mode 100644 index 1f9c3a41..00000000 --- a/src/caffe/layers/softmax_layer.cu +++ /dev/null @@ -1,149 +0,0 @@ -#include -#include -#include - -#include "thrust/device_vector.h" - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -__global__ void kernel_channel_max(const int num, const int channels, - const int spatial_dim, const Dtype* data, Dtype* out) { - CUDA_KERNEL_LOOP(index, num * spatial_dim) { - int n = index / spatial_dim; - int s = index % spatial_dim; - Dtype maxval = -FLT_MAX; - for (int c = 0; c < channels; ++c) { - maxval = max(data[(n * channels + c) * spatial_dim + s], maxval); - } - out[index] = maxval; - } -} - -template -__global__ void kernel_channel_subtract(const int count, - const int num, const int channels, - const int spatial_dim, const Dtype* channel_max, Dtype* data) { - CUDA_KERNEL_LOOP(index, count) { - int n = index / channels / spatial_dim; - int s = index % spatial_dim; - data[index] -= channel_max[n * spatial_dim + s]; - } -} - -template -__global__ void kernel_exp(const int count, const Dtype* data, Dtype* out) { - CUDA_KERNEL_LOOP(index, count) { - out[index] = exp(data[index]); - } -} - -template -__global__ void kernel_channel_sum(const int num, const int channels, - const int spatial_dim, const Dtype* data, Dtype* channel_sum) { - CUDA_KERNEL_LOOP(index, num * spatial_dim) { - int n = index / spatial_dim; - int s = index % spatial_dim; - Dtype sum = 0; - for (int c = 0; c < channels; ++c) { - sum += data[(n * channels + c) * spatial_dim + s]; - 
} - channel_sum[index] = sum; - } -} - -template -__global__ void kernel_channel_div(const int count, - const int num, const int channels, - const int spatial_dim, const Dtype* channel_sum, Dtype* data) { - CUDA_KERNEL_LOOP(index, count) { - int n = index / channels / spatial_dim; - int s = index % spatial_dim; - data[index] /= channel_sum[n * spatial_dim + s]; - } -} - -template -__global__ void kernel_channel_dot(const int num, const int channels, - const int spatial_dim, const Dtype* data_1, const Dtype* data_2, - Dtype* channel_dot) { - CUDA_KERNEL_LOOP(index, num * spatial_dim) { - int n = index / spatial_dim; - int s = index % spatial_dim; - Dtype dot = 0; - for (int c = 0; c < channels; ++c) { - dot += (data_1[(n * channels + c) * spatial_dim + s] - * data_2[(n * channels + c) * spatial_dim + s]); - } - channel_dot[index] = dot; - } -} - -template -void SoftmaxLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - Dtype* scale_data = scale_.mutable_gpu_data(); - int count = bottom[0]->count(); - int channels = top[0]->shape(softmax_axis_); - caffe_copy(count, bottom_data, top_data); - // We need to subtract the max to avoid numerical issues, compute the exp, - // and then normalize. - // compute max - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_max<<>>(outer_num_, channels, inner_num_, top_data, - scale_data); - // subtract - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_subtract<<>>(count, outer_num_, channels, inner_num_, - scale_data, top_data); - // exponentiate - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_exp<<>>( - count, top_data, top_data); - // sum after exp - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_sum<<>>(outer_num_, channels, inner_num_, top_data, - scale_data); - // divide - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_div<<>>(count, outer_num_, channels, inner_num_, - scale_data, top_data); -} - -template -void SoftmaxLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* top_data = top[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - Dtype* scale_data = scale_.mutable_gpu_data(); - int count = top[0]->count(); - int channels = top[0]->shape(softmax_axis_); - caffe_copy(count, top_diff, bottom_diff); - // Compute inner1d(top_diff, top_data) and subtract them from the bottom diff. 
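The kernel_channel_max / kernel_channel_subtract / kernel_exp / kernel_channel_sum / kernel_channel_div wrappers used in the new SoftmaxLayer::Forward_gpu reproduce, per (outer, spatial) position, the usual numerically stable softmax over the channel axis: subtract the channel max, exponentiate, then divide by the channel sum. In-place CPU sketch (name illustrative):

    #include <algorithm>
    #include <cmath>
    template <typename Dtype>
    void channel_softmax_ref(int num, int channels, int spatial_dim, Dtype* data) {
      for (int n = 0; n < num; ++n) {
        for (int s = 0; s < spatial_dim; ++s) {
          Dtype maxval = data[n * channels * spatial_dim + s];
          for (int c = 1; c < channels; ++c)
            maxval = std::max(maxval, data[(n * channels + c) * spatial_dim + s]);
          Dtype sum = 0;
          for (int c = 0; c < channels; ++c) {
            Dtype& v = data[(n * channels + c) * spatial_dim + s];
            v = std::exp(v - maxval);
            sum += v;
          }
          for (int c = 0; c < channels; ++c)
            data[(n * channels + c) * spatial_dim + s] /= sum;
        }
      }
    }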
- // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_dot<<>>(outer_num_, channels, inner_num_, - top_diff, top_data, scale_data); - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_subtract<<>>(count, outer_num_, channels, inner_num_, - scale_data, bottom_diff); - // elementwise multiplication - caffe_gpu_mul(top[0]->count(), bottom_diff, top_data, bottom_diff); -} - -INSTANTIATE_LAYER_GPU_FUNCS(SoftmaxLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/softmax_loss_layer.cpp b/src/caffe/layers/softmax_loss_layer.cpp index ba312f67..ef03ec7e 100644 --- a/src/caffe/layers/softmax_loss_layer.cpp +++ b/src/caffe/layers/softmax_loss_layer.cpp @@ -10,20 +10,19 @@ namespace caffe { template -void SoftmaxWithLossLayer::LayerSetUp( - const vector*>& bottom, const vector*>& top) { - LossLayer::LayerSetUp(bottom, top); +void SoftmaxWithLossLayer::LayerSetUp(const vector*>& bottom, + const vector*>& top) { + LossLayer < Dtype > ::LayerSetUp(bottom, top); LayerParameter softmax_param(this->layer_param_); softmax_param.set_type("Softmax"); - softmax_layer_ = LayerRegistry::CreateLayer(softmax_param); + softmax_layer_ = LayerRegistry < Dtype > ::CreateLayer(softmax_param); softmax_bottom_vec_.clear(); softmax_bottom_vec_.push_back(bottom[0]); softmax_top_vec_.clear(); softmax_top_vec_.push_back(&prob_); softmax_layer_->SetUp(softmax_bottom_vec_, softmax_top_vec_); - has_ignore_label_ = - this->layer_param_.loss_param().has_ignore_label(); + has_ignore_label_ = this->layer_param_.loss_param().has_ignore_label(); if (has_ignore_label_) { ignore_label_ = this->layer_param_.loss_param().ignore_label(); } @@ -31,12 +30,16 @@ void SoftmaxWithLossLayer::LayerSetUp( } template -void SoftmaxWithLossLayer::Reshape( - const vector*>& bottom, const vector*>& top) { - LossLayer::Reshape(bottom, top); +SoftmaxWithLossLayer::~SoftmaxWithLossLayer() { +} + +template +void SoftmaxWithLossLayer::Reshape(const vector*>& bottom, + const vector*>& top) { + LossLayer < Dtype > ::Reshape(bottom, top); softmax_layer_->Reshape(softmax_bottom_vec_, softmax_top_vec_); - softmax_axis_ = - bottom[0]->CanonicalAxisIndex(this->layer_param_.softmax_param().axis()); + softmax_axis_ = bottom[0]->CanonicalAxisIndex( + this->layer_param_.softmax_param().axis()); outer_num_ = bottom[0]->count(0, softmax_axis_); inner_num_ = bottom[0]->count(softmax_axis_ + 1); CHECK_EQ(outer_num_ * inner_num_, bottom[1]->count()) @@ -68,8 +71,9 @@ void SoftmaxWithLossLayer::Forward_cpu( } DCHECK_GE(label_value, 0); DCHECK_LT(label_value, prob_.shape(softmax_axis_)); - loss -= log(std::max(prob_data[i * dim + label_value * inner_num_ + j], - Dtype(FLT_MIN))); + loss -= log( + std::max(prob_data[i * dim + label_value * inner_num_ + j], + Dtype(FLT_MIN))); ++count; } } @@ -88,7 +92,7 @@ void SoftmaxWithLossLayer::Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { if (propagate_down[1]) { LOG(FATAL) << this->type() - << " Layer cannot backpropagate to label inputs."; + << " Layer cannot backpropagate to label inputs."; } if (propagate_down[0]) { Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); @@ -120,11 +124,79 @@ void SoftmaxWithLossLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY +// begin: code modified for OpenCL port +#ifndef CPU_ONLY +template +void SoftmaxWithLossLayer::Forward_gpu( + const vector*>& bottom, const vector*>& top) { + softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_); + const Dtype* prob_data = prob_.gpu_data(); + const Dtype* label = 
bottom[1]->gpu_data(); + const int dim = prob_.count() / outer_num_; + const int nthreads = outer_num_ * inner_num_; + // Since this memory is not used for anything until it is overwritten + // on the backward pass, we use it here to avoid having to allocate new GPU + // memory to accumulate intermediate results in the kernel. + Dtype* loss_data = bottom[0]->mutable_gpu_diff(); + // Similarly, this memory is never used elsewhere, and thus we can use it + // to avoid having to allocate additional GPU memory. + Dtype* counts = prob_.mutable_gpu_diff(); + // NOLINT_NEXT_LINE(whitespace/operators) + SoftmaxLossForwardGPU < Dtype + > (nthreads, prob_data, label, loss_data, outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts); + Dtype loss; + caffe_gpu_asum(nthreads, loss_data, &loss); + if (normalize_) { + Dtype count; + caffe_gpu_asum(nthreads, counts, &count); + loss /= count; + } else { + loss /= outer_num_; + } + top[0]->mutable_cpu_data()[0] = loss; + if (top.size() == 2) { + top[1]->ShareData(prob_); + } +} + +template +void SoftmaxWithLossLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[1]) { + LOG(FATAL) << this->type() + << " Layer cannot backpropagate to label inputs."; + } + if (propagate_down[0]) { + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const Dtype* prob_data = prob_.gpu_data(); + const Dtype* top_data = top[0]->gpu_data(); + caffe_gpu_memcpy(prob_.count() * sizeof(Dtype), prob_data, bottom_diff); + //caffe_gpu_copy(prob_.count(), prob_data, bottom_diff); + const Dtype* label = bottom[1]->gpu_data(); + const int dim = prob_.count() / outer_num_; + const int nthreads = outer_num_ * inner_num_; + // Since this memory is never used for anything else, + // we use to to avoid allocating new GPU memory. 
+ Dtype* counts = prob_.mutable_gpu_diff(); + // NOLINT_NEXT_LINE(whitespace/operators) + SoftmaxLossBackwardGPU < Dtype + > (nthreads, top_data, label, bottom_diff, outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts); + const Dtype loss_weight = top[0]->cpu_diff()[0]; + if (normalize_) { + Dtype count; + caffe_gpu_asum(nthreads, counts, &count); + caffe_gpu_scal(prob_.count(), loss_weight / count, bottom_diff); + } else { + caffe_gpu_scal(prob_.count(), loss_weight / outer_num_, bottom_diff); + } + } +} +// end: code modified for OpenCL port +#else STUB_GPU(SoftmaxWithLossLayer); #endif -INSTANTIATE_CLASS(SoftmaxWithLossLayer); -REGISTER_LAYER_CLASS(SoftmaxWithLoss); +INSTANTIATE_CLASS (SoftmaxWithLossLayer); +REGISTER_LAYER_CLASS (SoftmaxWithLoss); } // namespace caffe diff --git a/src/caffe/layers/softmax_loss_layer.cu b/src/caffe/layers/softmax_loss_layer.cu deleted file mode 100644 index 7e0f3da4..00000000 --- a/src/caffe/layers/softmax_loss_layer.cu +++ /dev/null @@ -1,125 +0,0 @@ -#include -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -__global__ void SoftmaxLossForwardGPU(const int nthreads, - const Dtype* prob_data, const Dtype* label, Dtype* loss, - const int num, const int dim, const int spatial_dim, - const bool has_ignore_label_, const int ignore_label_, - Dtype* counts) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int n = index / spatial_dim; - const int s = index % spatial_dim; - const int label_value = static_cast(label[n * spatial_dim + s]); - if (has_ignore_label_ && label_value == ignore_label_) { - loss[index] = 0; - counts[index] = 0; - } else { - loss[index] = -log(max(prob_data[n * dim + label_value * spatial_dim + s], - Dtype(FLT_MIN))); - counts[index] = 1; - } - } -} - -template -void SoftmaxWithLossLayer::Forward_gpu( - const vector*>& bottom, const vector*>& top) { - softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_); - const Dtype* prob_data = prob_.gpu_data(); - const Dtype* label = bottom[1]->gpu_data(); - const int dim = prob_.count() / outer_num_; - const int nthreads = outer_num_ * inner_num_; - // Since this memory is not used for anything until it is overwritten - // on the backward pass, we use it here to avoid having to allocate new GPU - // memory to accumulate intermediate results in the kernel. - Dtype* loss_data = bottom[0]->mutable_gpu_diff(); - // Similarly, this memory is never used elsewhere, and thus we can use it - // to avoid having to allocate additional GPU memory. 
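The SoftmaxWithLoss GPU path above mirrors the CPU one: the forward pass accumulates -log(max(prob[label], FLT_MIN)) per position, and the backward pass starts from the probabilities and subtracts 1 at the ground-truth channel before scaling by the loss weight over the normalizer. A CPU sketch of the normalize_ == true case, with ignore_label_ handling omitted and an illustrative name:

    #include <algorithm>
    #include <cfloat>
    #include <cmath>
    // prob: outer_num x channels x inner_num (already softmaxed);
    // label: outer_num x inner_num; diff receives the unweighted gradient.
    template <typename Dtype>
    Dtype softmax_loss_ref(int outer_num, int channels, int inner_num,
                           const Dtype* prob, const Dtype* label, Dtype* diff) {
      const int dim = channels * inner_num;
      Dtype loss = 0;
      int count = 0;
      for (int i = 0; i < outer_num * dim; ++i) diff[i] = prob[i];
      for (int n = 0; n < outer_num; ++n) {
        for (int s = 0; s < inner_num; ++s) {
          const int lv = static_cast<int>(label[n * inner_num + s]);
          loss -= std::log(std::max(prob[n * dim + lv * inner_num + s],
                                    Dtype(FLT_MIN)));
          diff[n * dim + lv * inner_num + s] -= 1;
          ++count;
        }
      }
      for (int i = 0; i < outer_num * dim; ++i) diff[i] /= count;
      return loss / count;
    }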
- Dtype* counts = prob_.mutable_gpu_diff(); - // NOLINT_NEXT_LINE(whitespace/operators) - SoftmaxLossForwardGPU<<>>(nthreads, prob_data, label, loss_data, - outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts); - Dtype loss; - caffe_gpu_asum(nthreads, loss_data, &loss); - if (normalize_) { - Dtype count; - caffe_gpu_asum(nthreads, counts, &count); - loss /= count; - } else { - loss /= outer_num_; - } - top[0]->mutable_cpu_data()[0] = loss; - if (top.size() == 2) { - top[1]->ShareData(prob_); - } -} - -template -__global__ void SoftmaxLossBackwardGPU(const int nthreads, const Dtype* top, - const Dtype* label, Dtype* bottom_diff, const int num, const int dim, - const int spatial_dim, const bool has_ignore_label_, - const int ignore_label_, Dtype* counts) { - const int channels = dim / spatial_dim; - - CUDA_KERNEL_LOOP(index, nthreads) { - const int n = index / spatial_dim; - const int s = index % spatial_dim; - const int label_value = static_cast(label[n * spatial_dim + s]); - - if (has_ignore_label_ && label_value == ignore_label_) { - for (int c = 0; c < channels; ++c) { - bottom_diff[n * dim + c * spatial_dim + s] = 0; - } - counts[index] = 0; - } else { - bottom_diff[n * dim + label_value * spatial_dim + s] -= 1; - counts[index] = 1; - } - } -} - -template -void SoftmaxWithLossLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (propagate_down[1]) { - LOG(FATAL) << this->type() - << " Layer cannot backpropagate to label inputs."; - } - if (propagate_down[0]) { - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const Dtype* prob_data = prob_.gpu_data(); - const Dtype* top_data = top[0]->gpu_data(); - caffe_gpu_memcpy(prob_.count() * sizeof(Dtype), prob_data, bottom_diff); - const Dtype* label = bottom[1]->gpu_data(); - const int dim = prob_.count() / outer_num_; - const int nthreads = outer_num_ * inner_num_; - // Since this memory is never used for anything else, - // we use to to avoid allocating new GPU memory. - Dtype* counts = prob_.mutable_gpu_diff(); - // NOLINT_NEXT_LINE(whitespace/operators) - SoftmaxLossBackwardGPU<<>>(nthreads, top_data, label, bottom_diff, - outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts); - const Dtype loss_weight = top[0]->cpu_diff()[0]; - if (normalize_) { - Dtype count; - caffe_gpu_asum(nthreads, counts, &count); - caffe_gpu_scal(prob_.count(), loss_weight / count, bottom_diff); - } else { - caffe_gpu_scal(prob_.count(), loss_weight / outer_num_, bottom_diff); - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(SoftmaxWithLossLayer); - -} // namespace caffe diff --git a/src/caffe/layers/split_layer.cpp b/src/caffe/layers/split_layer.cpp index 272cb59c..e92f7bf2 100644 --- a/src/caffe/layers/split_layer.cpp +++ b/src/caffe/layers/split_layer.cpp @@ -8,7 +8,7 @@ namespace caffe { template void SplitLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { count_ = bottom[0]->count(); for (int i = 0; i < top.size(); ++i) { // Do not allow in-place computation in the SplitLayer. 
Instead, share data @@ -25,7 +25,7 @@ void SplitLayer::Reshape(const vector*>& bottom, template void SplitLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { for (int i = 0; i < top.size(); ++i) { top[i]->ShareData(*bottom[0]); } @@ -33,14 +33,16 @@ void SplitLayer::Forward_cpu(const vector*>& bottom, template void SplitLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { return; } + const vector& propagate_down, const vector*>& bottom) { + if (!propagate_down[0]) { + return; + } if (top.size() == 1) { caffe_copy(count_, top[0]->cpu_diff(), bottom[0]->mutable_cpu_diff()); return; } caffe_add(count_, top[0]->cpu_diff(), top[1]->cpu_diff(), - bottom[0]->mutable_cpu_diff()); + bottom[0]->mutable_cpu_diff()); // Add remaining top blob diffs. for (int i = 2; i < top.size(); ++i) { const Dtype* top_diff = top[i]->cpu_diff(); @@ -49,12 +51,41 @@ void SplitLayer::Backward_cpu(const vector*>& top, } } +#ifndef CPU_ONLY +template +void SplitLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + for (int i = 0; i < top.size(); ++i) { + top[i]->ShareData(*bottom[0]); + } +} -#ifdef CPU_ONLY +// begin: code modified for OpenCL port +template +void SplitLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + if (!propagate_down[0]) { + return; + } + if (top.size() == 1) { + caffe_gpu_copy(count_, top[0]->gpu_diff(), bottom[0]->mutable_gpu_diff()); + return; + } + caffe_gpu_add(count_, top[0]->gpu_diff(), top[1]->gpu_diff(), + bottom[0]->mutable_gpu_diff()); + // Add remaining top blob diffs. + for (int i = 2; i < top.size(); ++i) { + const Dtype* top_diff = top[i]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + caffe_gpu_axpy(count_, Dtype(1.), top_diff, bottom_diff); + } +} +// begin: code modified for OpenCL port +#else STUB_GPU(SplitLayer); #endif -INSTANTIATE_CLASS(SplitLayer); -REGISTER_LAYER_CLASS(Split); +INSTANTIATE_CLASS (SplitLayer); +REGISTER_LAYER_CLASS (Split); } // namespace caffe diff --git a/src/caffe/layers/split_layer.cu b/src/caffe/layers/split_layer.cu deleted file mode 100644 index a4f5df26..00000000 --- a/src/caffe/layers/split_layer.cu +++ /dev/null @@ -1,38 +0,0 @@ -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void SplitLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - for (int i = 0; i < top.size(); ++i) { - top[i]->ShareData(*bottom[0]); - } -} - -template -void SplitLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { return; } - if (top.size() == 1) { - caffe_copy(count_, top[0]->gpu_diff(), bottom[0]->mutable_gpu_diff()); - return; - } - caffe_gpu_add(count_, top[0]->gpu_diff(), top[1]->gpu_diff(), - bottom[0]->mutable_gpu_diff()); - // Add remaining top blob diffs. 
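The split layer's backward pass, CPU and GPU alike, simply sums the diffs of every top blob into the single bottom diff; the forward pass shares data and copies nothing. CPU sketch (name illustrative):

    template <typename Dtype>
    void split_backward_ref(int count, int num_tops,
                            const Dtype* const* top_diffs, Dtype* bottom_diff) {
      for (int i = 0; i < count; ++i) {
        Dtype sum = 0;
        for (int t = 0; t < num_tops; ++t) sum += top_diffs[t][i];
        bottom_diff[i] = sum;
      }
    }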
- for (int i = 2; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - caffe_gpu_axpy(count_, Dtype(1.), top_diff, bottom_diff); - } -} - - -INSTANTIATE_LAYER_GPU_FUNCS(SplitLayer); - -} // namespace caffe diff --git a/src/caffe/layers/spp_layer.cpp b/src/caffe/layers/spp_layer.cpp index 795dd716..d552af61 100644 --- a/src/caffe/layers/spp_layer.cpp +++ b/src/caffe/layers/spp_layer.cpp @@ -15,7 +15,7 @@ using std::max; template LayerParameter SPPLayer::GetPoolingParam(const int pyramid_level, - const int bottom_h, const int bottom_w, const SPPParameter spp_param) { + const int bottom_h, const int bottom_w, const SPPParameter spp_param) { LayerParameter pooling_param; int num_bins = pow(2, pyramid_level); @@ -63,7 +63,7 @@ LayerParameter SPPLayer::GetPoolingParam(const int pyramid_level, template void SPPLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { SPPParameter spp_param = this->layer_param_.spp_param(); bottom_h_ = bottom[0]->height(); @@ -103,11 +103,12 @@ void SPPLayer::LayerSetUp(const vector*>& bottom, pooling_top_vecs_[i]->push_back(pooling_outputs_[i]); // pooling layer setup - LayerParameter pooling_param = GetPoolingParam( - i, bottom_h_, bottom_w_, spp_param); + LayerParameter pooling_param = GetPoolingParam(i, bottom_h_, bottom_w_, + spp_param); - pooling_layers_.push_back(shared_ptr > ( - new PoolingLayer(pooling_param))); + pooling_layers_.push_back( + shared_ptr < PoolingLayer + > (new PoolingLayer(pooling_param))); pooling_layers_[i]->SetUp(*pooling_bottom_vecs_[i], *pooling_top_vecs_[i]); // flatten layer output holders setup @@ -132,7 +133,7 @@ void SPPLayer::LayerSetUp(const vector*>& bottom, template void SPPLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " << "corresponding to (num, channels, height, width)"; channels_ = bottom[0]->channels(); @@ -141,53 +142,48 @@ void SPPLayer::Reshape(const vector*>& bottom, SPPParameter spp_param = this->layer_param_.spp_param(); split_layer_->Reshape(bottom, split_top_vec_); for (int i = 0; i < pyramid_height_; i++) { - LayerParameter pooling_param = GetPoolingParam( - i, bottom_h_, bottom_w_, spp_param); - - pooling_layers_[i].reset( - new PoolingLayer(pooling_param)); - pooling_layers_[i]->SetUp( - *pooling_bottom_vecs_[i], *pooling_top_vecs_[i]); - pooling_layers_[i]->Reshape( - *pooling_bottom_vecs_[i], *pooling_top_vecs_[i]); - flatten_layers_[i]->Reshape( - *pooling_top_vecs_[i], *flatten_top_vecs_[i]); + LayerParameter pooling_param = GetPoolingParam(i, bottom_h_, bottom_w_, + spp_param); + + pooling_layers_[i].reset(new PoolingLayer(pooling_param)); + pooling_layers_[i]->SetUp(*pooling_bottom_vecs_[i], *pooling_top_vecs_[i]); + pooling_layers_[i]->Reshape(*pooling_bottom_vecs_[i], + *pooling_top_vecs_[i]); + flatten_layers_[i]->Reshape(*pooling_top_vecs_[i], *flatten_top_vecs_[i]); } concat_layer_->Reshape(concat_bottom_vec_, top); } template void SPPLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { split_layer_->Forward(bottom, split_top_vec_); for (int i = 0; i < pyramid_height_; i++) { - pooling_layers_[i]->Forward( - *pooling_bottom_vecs_[i], *pooling_top_vecs_[i]); - flatten_layers_[i]->Forward( - *pooling_top_vecs_[i], *flatten_top_vecs_[i]); + pooling_layers_[i]->Forward(*pooling_bottom_vecs_[i], + *pooling_top_vecs_[i]); + 
flatten_layers_[i]->Forward(*pooling_top_vecs_[i], *flatten_top_vecs_[i]); } concat_layer_->Forward(concat_bottom_vec_, top); } template void SPPLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { if (!propagate_down[0]) { return; } vector concat_propagate_down(pyramid_height_, true); concat_layer_->Backward(top, concat_propagate_down, concat_bottom_vec_); for (int i = 0; i < pyramid_height_; i++) { - flatten_layers_[i]->Backward( - *flatten_top_vecs_[i], propagate_down, *pooling_top_vecs_[i]); - pooling_layers_[i]->Backward( - *pooling_top_vecs_[i], propagate_down, *pooling_bottom_vecs_[i]); + flatten_layers_[i]->Backward(*flatten_top_vecs_[i], propagate_down, + *pooling_top_vecs_[i]); + pooling_layers_[i]->Backward(*pooling_top_vecs_[i], propagate_down, + *pooling_bottom_vecs_[i]); } split_layer_->Backward(split_top_vec_, propagate_down, bottom); } - -INSTANTIATE_CLASS(SPPLayer); -REGISTER_LAYER_CLASS(SPP); +INSTANTIATE_CLASS (SPPLayer); +REGISTER_LAYER_CLASS (SPP); } // namespace caffe diff --git a/src/caffe/layers/tanh_layer.cpp b/src/caffe/layers/tanh_layer.cpp index ee5ed773..f62092b2 100644 --- a/src/caffe/layers/tanh_layer.cpp +++ b/src/caffe/layers/tanh_layer.cpp @@ -6,6 +6,7 @@ #include "caffe/layer.hpp" #include "caffe/vision_layers.hpp" +#include "caffe/util/ocl_wrapper.hpp" namespace caffe { @@ -22,8 +23,7 @@ void TanHLayer::Forward_cpu(const vector*>& bottom, template void TanHLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { if (propagate_down[0]) { const Dtype* top_data = top[0]->cpu_data(); const Dtype* top_diff = top[0]->cpu_diff(); @@ -37,10 +37,36 @@ void TanHLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY +#ifndef CPU_ONLY +// begin: code modified for OpenCL port +template +void TanHLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + const int count = bottom[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + TanHForward(count, bottom_data, top_data); +} + +template +void TanHLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[0]) { + const Dtype* top_data = top[0]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const int count = bottom[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + TanHBackward(count, top_diff, top_data, bottom_diff); + } +} +// end: code modified for OpenCL port + +#else STUB_GPU(TanHLayer); #endif -INSTANTIATE_CLASS(TanHLayer); +INSTANTIATE_CLASS (TanHLayer); } // namespace caffe diff --git a/src/caffe/layers/tanh_layer.cu b/src/caffe/layers/tanh_layer.cu deleted file mode 100644 index ccd6e63e..00000000 --- a/src/caffe/layers/tanh_layer.cu +++ /dev/null @@ -1,59 +0,0 @@ -// TanH neuron activation function layer. 
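TanHForward and TanHBackward in the tanh_layer.cpp hunk above wrap the OpenCL kernels; the math is y = tanh(x) on the forward pass and dx = dy * (1 - y^2) on the backward pass, reusing the stored top data. Reference sketch of the backward step (name illustrative):

    #include <cmath>
    template <typename Dtype>
    void tanh_backward_ref(int n, const Dtype* top_diff, const Dtype* top_data,
                           Dtype* bottom_diff) {
      for (int i = 0; i < n; ++i) {
        const Dtype y = top_data[i];            // y = tanh(x) from the forward pass
        bottom_diff[i] = top_diff[i] * (1 - y * y);
      }
    }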
-// Adapted from ReLU layer code written by Yangqing Jia - -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -__global__ void TanHForward(const int n, const Dtype* in, Dtype* out) { - CUDA_KERNEL_LOOP(index, n) { - out[index] = tanh(in[index]); - } -} - -template -void TanHLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - TanHForward<<>>( - count, bottom_data, top_data); - CUDA_POST_KERNEL_CHECK; -} - -template -__global__ void TanHBackward(const int n, const Dtype* in_diff, - const Dtype* out_data, Dtype* out_diff) { - CUDA_KERNEL_LOOP(index, n) { - Dtype tanhx = out_data[index]; - out_diff[index] = in_diff[index] * (1 - tanhx * tanhx); - } -} - -template -void TanHLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - TanHBackward<<>>( - count, top_diff, top_data, bottom_diff); - CUDA_POST_KERNEL_CHECK; - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(TanHLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/threshold_layer.cpp b/src/caffe/layers/threshold_layer.cpp index 2365e7b9..eebc379a 100644 --- a/src/caffe/layers/threshold_layer.cpp +++ b/src/caffe/layers/threshold_layer.cpp @@ -2,14 +2,14 @@ #include "caffe/layer.hpp" #include "caffe/vision_layers.hpp" - +#include "caffe/util/ocl_wrapper.hpp" namespace caffe { template void ThresholdLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - NeuronLayer::LayerSetUp(bottom, top); + const vector*>& top) { + NeuronLayer < Dtype > ::LayerSetUp(bottom, top); threshold_ = this->layer_param_.threshold_param().threshold(); } @@ -24,11 +24,24 @@ void ThresholdLayer::Forward_cpu(const vector*>& bottom, } } -#ifdef CPU_ONLY +#ifndef CPU_ONLY +// begin: code modified for OpenCL port +template +void ThresholdLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + const int count = bottom[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + ThresholdForward(count, threshold_, bottom_data, top_data); +} +// end: code modified for OpenCL port + +#else STUB_GPU_FORWARD(ThresholdLayer, Forward); #endif -INSTANTIATE_CLASS(ThresholdLayer); -REGISTER_LAYER_CLASS(Threshold); +INSTANTIATE_CLASS (ThresholdLayer); +REGISTER_LAYER_CLASS (Threshold); } // namespace caffe diff --git a/src/caffe/layers/threshold_layer.cu b/src/caffe/layers/threshold_layer.cu deleted file mode 100644 index bfa7f159..00000000 --- a/src/caffe/layers/threshold_layer.cu +++ /dev/null @@ -1,33 +0,0 @@ -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -__global__ void ThresholdForward(const int n, const Dtype threshold, - const Dtype* in, Dtype* out) { - CUDA_KERNEL_LOOP(index, n) { - out[index] = in[index] > threshold ? 
1 : 0; - } -} - -template -void ThresholdLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - ThresholdForward<<>>( - count, threshold_, bottom_data, top_data); - CUDA_POST_KERNEL_CHECK; -} - - -INSTANTIATE_LAYER_GPU_FORWARD(ThresholdLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/window_data_layer.cpp b/src/caffe/layers/window_data_layer.cpp index c127d56b..7085ac63 100644 --- a/src/caffe/layers/window_data_layer.cpp +++ b/src/caffe/layers/window_data_layer.cpp @@ -32,7 +32,7 @@ WindowDataLayer::~WindowDataLayer() { template void WindowDataLayer::DataLayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { // LayerSetUp runs through the window_file and creates two structures // that hold windows: one for foreground (object) windows and one // for background (non-object) windows. We use an overlap threshold @@ -63,9 +63,8 @@ void WindowDataLayer::DataLayerSetUp(const vector*>& bottom, cache_images_ = this->layer_param_.window_data_param().cache_images(); string root_folder = this->layer_param_.window_data_param().root_folder(); - const bool prefetch_needs_rand = - this->transform_param_.mirror() || - this->transform_param_.crop_size(); + const bool prefetch_needs_rand = this->transform_param_.mirror() + || this->transform_param_.crop_size(); if (prefetch_needs_rand) { const unsigned int prefetch_rng_seed = caffe_rng_rand(); prefetch_rng_.reset(new Caffe::RNG(prefetch_rng_seed)); @@ -143,21 +142,18 @@ void WindowDataLayer::DataLayerSetUp(const vector*>& bottom, } if (image_index % 100 == 0) { - LOG(INFO) << "num: " << image_index << " " - << image_path << " " - << image_size[0] << " " - << image_size[1] << " " - << image_size[2] << " " - << "windows to process: " << num_windows; + LOG(INFO) << "num: " << image_index << " " << image_path << " " + << image_size[0] << " " << image_size[1] << " " << image_size[2] + << " " << "windows to process: " << num_windows; } } while (infile >> hashtag >> image_index); - LOG(INFO) << "Number of images: " << image_index+1; + LOG(INFO) << "Number of images: " << image_index + 1; - for (map::iterator it = label_hist.begin(); - it != label_hist.end(); ++it) { + for (map::iterator it = label_hist.begin(); it != label_hist.end(); + ++it) { LOG(INFO) << "class " << it->first << " has " << label_hist[it->first] - << " samples"; + << " samples"; } LOG(INFO) << "Amount of context padding: " @@ -185,21 +181,20 @@ void WindowDataLayer::DataLayerSetUp(const vector*>& bottom, has_mean_file_ = this->transform_param_.has_mean_file(); has_mean_values_ = this->transform_param_.mean_value_size() > 0; if (has_mean_file_) { - const string& mean_file = - this->transform_param_.mean_file(); + const string& mean_file = this->transform_param_.mean_file(); LOG(INFO) << "Loading mean file from: " << mean_file; BlobProto blob_proto; ReadProtoFromBinaryFileOrDie(mean_file.c_str(), &blob_proto); data_mean_.FromProto(blob_proto); } if (has_mean_values_) { - CHECK(has_mean_file_ == false) << - "Cannot specify mean_file and mean_value at the same time"; + CHECK(has_mean_file_ == false) + << "Cannot specify mean_file and mean_value at the same time"; for (int c = 0; c < this->transform_param_.mean_value_size(); ++c) { mean_values_.push_back(this->transform_param_.mean_value(c)); } - CHECK(mean_values_.size() == 1 || mean_values_.size() == channels) 
<< - "Specify either 1 mean_value or as many as channels: " << channels; + CHECK(mean_values_.size() == 1 || mean_values_.size() == channels) + << "Specify either 1 mean_value or as many as channels: " << channels; if (channels > 1 && mean_values_.size() == 1) { // Replicate the mean_value for simplicity for (int c = 1; c < channels; ++c) { @@ -211,7 +206,7 @@ void WindowDataLayer::DataLayerSetUp(const vector*>& bottom, template unsigned int WindowDataLayer::PrefetchRand() { - CHECK(prefetch_rng_); + CHECK (prefetch_rng_); caffe::rng_t* prefetch_rng = static_cast(prefetch_rng_->generator()); return (*prefetch_rng)(); @@ -265,20 +260,21 @@ void WindowDataLayer::InternalThreadEntry() { // sample a window timer.Start(); const unsigned int rand_index = PrefetchRand(); - vector window = (is_fg) ? - fg_windows_[rand_index % fg_windows_.size()] : - bg_windows_[rand_index % bg_windows_.size()]; + vector window = + (is_fg) ? + fg_windows_[rand_index % fg_windows_.size()] : + bg_windows_[rand_index % bg_windows_.size()]; bool do_mirror = mirror && PrefetchRand() % 2; // load the image containing the window pair > image = - image_database_[window[WindowDataLayer::IMAGE_INDEX]]; + image_database_[window[WindowDataLayer < Dtype > ::IMAGE_INDEX]]; cv::Mat cv_img; if (this->cache_images_) { - pair image_cached = - image_database_cache_[window[WindowDataLayer::IMAGE_INDEX]]; + pair < std::string, Datum > image_cached = + image_database_cache_[window[WindowDataLayer < Dtype > ::IMAGE_INDEX]]; cv_img = DecodeDatumToCVMat(image_cached.second, true); } else { cv_img = cv::imread(image.first, CV_LOAD_IMAGE_COLOR); @@ -292,10 +288,10 @@ void WindowDataLayer::InternalThreadEntry() { const int channels = cv_img.channels(); // crop window out of image and warp it - int x1 = window[WindowDataLayer::X1]; - int y1 = window[WindowDataLayer::Y1]; - int x2 = window[WindowDataLayer::X2]; - int y2 = window[WindowDataLayer::Y2]; + int x1 = window[WindowDataLayer < Dtype > ::X1]; + int y1 = window[WindowDataLayer < Dtype > ::Y1]; + int x2 = window[WindowDataLayer < Dtype > ::X2]; + int y2 = window[WindowDataLayer < Dtype > ::Y2]; int pad_w = 0; int pad_h = 0; @@ -303,12 +299,12 @@ void WindowDataLayer::InternalThreadEntry() { // scale factor by which to expand the original region // such that after warping the expanded region to crop_size x crop_size // there's exactly context_pad amount of padding on each side - Dtype context_scale = static_cast(crop_size) / - static_cast(crop_size - 2*context_pad); + Dtype context_scale = static_cast(crop_size) + / static_cast(crop_size - 2 * context_pad); // compute the expanded region - Dtype half_height = static_cast(y2-y1+1)/2.0; - Dtype half_width = static_cast(x2-x1+1)/2.0; + Dtype half_height = static_cast(y2 - y1 + 1) / 2.0; + Dtype half_width = static_cast(x2 - x1 + 1) / 2.0; Dtype center_x = static_cast(x1) + half_width; Dtype center_y = static_cast(y1) + half_height; if (use_square) { @@ -318,16 +314,16 @@ void WindowDataLayer::InternalThreadEntry() { half_height = half_width; } } - x1 = static_cast(round(center_x - half_width*context_scale)); - x2 = static_cast(round(center_x + half_width*context_scale)); - y1 = static_cast(round(center_y - half_height*context_scale)); - y2 = static_cast(round(center_y + half_height*context_scale)); + x1 = static_cast(round(center_x - half_width * context_scale)); + x2 = static_cast(round(center_x + half_width * context_scale)); + y1 = static_cast(round(center_y - half_height * context_scale)); + y2 = static_cast(round(center_y + half_height * 
context_scale)); // the expanded region may go outside of the image // so we compute the clipped (expanded) region and keep track of // the extent beyond the image - int unclipped_height = y2-y1+1; - int unclipped_width = x2-x1+1; + int unclipped_height = y2 - y1 + 1; + int unclipped_width = x2 - x1 + 1; int pad_x1 = std::max(0, -x1); int pad_y1 = std::max(0, -y1); int pad_x2 = std::max(0, x2 - cv_img.cols + 1); @@ -342,25 +338,25 @@ void WindowDataLayer::InternalThreadEntry() { CHECK_LT(x2, cv_img.cols); CHECK_LT(y2, cv_img.rows); - int clipped_height = y2-y1+1; - int clipped_width = x2-x1+1; + int clipped_height = y2 - y1 + 1; + int clipped_width = x2 - x1 + 1; // scale factors that would be used to warp the unclipped // expanded region - Dtype scale_x = - static_cast(crop_size)/static_cast(unclipped_width); - Dtype scale_y = - static_cast(crop_size)/static_cast(unclipped_height); + Dtype scale_x = static_cast(crop_size) + / static_cast(unclipped_width); + Dtype scale_y = static_cast(crop_size) + / static_cast(unclipped_height); // size to warp the clipped expanded region to - cv_crop_size.width = - static_cast(round(static_cast(clipped_width)*scale_x)); - cv_crop_size.height = - static_cast(round(static_cast(clipped_height)*scale_y)); - pad_x1 = static_cast(round(static_cast(pad_x1)*scale_x)); - pad_x2 = static_cast(round(static_cast(pad_x2)*scale_x)); - pad_y1 = static_cast(round(static_cast(pad_y1)*scale_y)); - pad_y2 = static_cast(round(static_cast(pad_y2)*scale_y)); + cv_crop_size.width = static_cast(round( + static_cast(clipped_width) * scale_x)); + cv_crop_size.height = static_cast(round( + static_cast(clipped_height) * scale_y)); + pad_x1 = static_cast(round(static_cast(pad_x1) * scale_x)); + pad_x2 = static_cast(round(static_cast(pad_x2) * scale_x)); + pad_y1 = static_cast(round(static_cast(pad_y1) * scale_y)); + pad_y2 = static_cast(round(static_cast(pad_y2) * scale_y)); pad_h = pad_y1; // if we're mirroring, we mirror the padding too (to be pedantic) @@ -380,10 +376,10 @@ void WindowDataLayer::InternalThreadEntry() { } } - cv::Rect roi(x1, y1, x2-x1+1, y2-y1+1); + cv::Rect roi(x1, y1, x2 - x1 + 1, y2 - y1 + 1); cv::Mat cv_cropped_img = cv_img(roi); - cv::resize(cv_cropped_img, cv_cropped_img, - cv_crop_size, 0, 0, cv::INTER_LINEAR); + cv::resize(cv_cropped_img, cv_cropped_img, cv_crop_size, 0, 0, + cv::INTER_LINEAR); // horizontal flip at random if (do_mirror) { @@ -392,17 +388,17 @@ void WindowDataLayer::InternalThreadEntry() { // copy the warped window into top_data for (int h = 0; h < cv_cropped_img.rows; ++h) { - const uchar* ptr = cv_cropped_img.ptr(h); + const uchar* ptr = cv_cropped_img.ptr < uchar > (h); int img_index = 0; for (int w = 0; w < cv_cropped_img.cols; ++w) { for (int c = 0; c < channels; ++c) { int top_index = ((item_id * channels + c) * crop_size + h + pad_h) - * crop_size + w + pad_w; + * crop_size + w + pad_w; // int top_index = (c * height + h) * width + w; Dtype pixel = static_cast(ptr[img_index++]); if (this->has_mean_file_) { int mean_index = (c * mean_height + h + mean_off + pad_h) - * mean_width + w + mean_off + pad_w; + * mean_width + w + mean_off + pad_w; top_data[top_index] = (pixel - mean[mean_index]) * scale; } else { if (this->has_mean_values_) { @@ -416,40 +412,7 @@ void WindowDataLayer::InternalThreadEntry() { } trans_time += timer.MicroSeconds(); // get window label - top_label[item_id] = window[WindowDataLayer::LABEL]; - - #if 0 - // useful debugging code for dumping transformed windows to disk - string file_id; - std::stringstream ss; - 
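To make the context-padding arithmetic above concrete, take illustrative values of crop_size = 227 and context_pad = 16 (typical R-CNN settings, not taken from this hunk): context_scale = 227 / (227 - 2*16) = 227/195, about 1.164. A 100x80 window is therefore expanded around its center to roughly 116x93 pixels before warping; once that expanded region is resized to 227x227, the original window occupies the central 195x195 area and about 16 pixels of context survive on each side. The pad_x*/pad_y* amounts are rescaled by the same scale_x/scale_y factors, and the copy loop offsets by pad_h/pad_w, so any part of the expanded region that fell outside the image stays as padding in the warped crop rather than being stretched over.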
ss << PrefetchRand(); - ss >> file_id; - std::ofstream inf((string("dump/") + file_id + - string("_info.txt")).c_str(), std::ofstream::out); - inf << image.first << std::endl - << window[WindowDataLayer::X1]+1 << std::endl - << window[WindowDataLayer::Y1]+1 << std::endl - << window[WindowDataLayer::X2]+1 << std::endl - << window[WindowDataLayer::Y2]+1 << std::endl - << do_mirror << std::endl - << top_label[item_id] << std::endl - << is_fg << std::endl; - inf.close(); - std::ofstream top_data_file((string("dump/") + file_id + - string("_data.txt")).c_str(), - std::ofstream::out | std::ofstream::binary); - for (int c = 0; c < channels; ++c) { - for (int h = 0; h < crop_size; ++h) { - for (int w = 0; w < crop_size; ++w) { - top_data_file.write(reinterpret_cast( - &top_data[((item_id * channels + c) * crop_size + h) - * crop_size + w]), - sizeof(Dtype)); - } - } - } - top_data_file.close(); - #endif + top_label[item_id] = window[WindowDataLayer < Dtype > ::LABEL]; item_id++; } @@ -460,7 +423,7 @@ void WindowDataLayer::InternalThreadEntry() { DLOG(INFO) << "Transform time: " << trans_time / 1000 << " ms."; } -INSTANTIATE_CLASS(WindowDataLayer); -REGISTER_LAYER_CLASS(WindowData); +INSTANTIATE_CLASS (WindowDataLayer); +REGISTER_LAYER_CLASS (WindowData); } // namespace caffe diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index a18ee638..711ec408 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -13,6 +13,7 @@ #include "caffe/util/io.hpp" #include "caffe/util/math_functions.hpp" #include "caffe/util/upgrade_proto.hpp" +#include "caffe/util/benchmark.hpp" #include "caffe/test/test_caffe_main.hpp" @@ -40,14 +41,14 @@ void Net::Init(const NetParameter& in_param) { NetParameter filtered_param; FilterNet(in_param, &filtered_param); LOG(INFO) << "Initializing net from parameters: " << std::endl - << filtered_param.DebugString(); + << filtered_param.DebugString(); // Create a copy of filtered_param with splits added where necessary. NetParameter param; InsertSplits(filtered_param, ¶m); // Basically, build all the layers and set up their connections. name_ = param.name(); map blob_name_to_idx; - set available_blobs; + set < string > available_blobs; CHECK(param.input_dim_size() == 0 || param.input_shape_size() == 0) << "Must specify either input_shape OR deprecated input_dim, not both."; if (param.input_dim_size() > 0) { @@ -80,21 +81,20 @@ void Net::Init(const NetParameter& in_param) { // Setup layer. const LayerParameter& layer_param = param.layer(layer_id); if (layer_param.propagate_down_size() > 0) { - CHECK_EQ(layer_param.propagate_down_size(), - layer_param.bottom_size()) + CHECK_EQ(layer_param.propagate_down_size(), layer_param.bottom_size()) << "propagate_down param must be specified " << "either 0 or bottom_size times "; } - layers_.push_back(LayerRegistry::CreateLayer(layer_param)); + layers_.push_back(LayerRegistry < Dtype > ::CreateLayer(layer_param)); layer_names_.push_back(layer_param.name()); LOG(INFO) << "Creating Layer " << layer_param.name(); bool need_backward = false; // Figure out this layer's input and output for (int bottom_id = 0; bottom_id < layer_param.bottom_size(); - ++bottom_id) { + ++bottom_id) { const int blob_id = AppendBottom(param, layer_id, bottom_id, - &available_blobs, &blob_name_to_idx); + &available_blobs, &blob_name_to_idx); // If a blob needs backward, this layer should provide it. 
need_backward |= blob_need_backward_[blob_id]; } @@ -105,10 +105,10 @@ void Net::Init(const NetParameter& in_param) { // If the layer specifies that AutoTopBlobs() -> true and the LayerParameter // specified fewer than the required number (as specified by // ExactNumTopBlobs() or MinTopBlobs()), allocate them here. - Layer* layer = layers_[layer_id].get(); + Layer < Dtype > *layer = layers_[layer_id].get(); if (layer->AutoTopBlobs()) { - const int needed_num_top = - std::max(layer->MinTopBlobs(), layer->ExactNumTopBlobs()); + const int needed_num_top = std::max(layer->MinTopBlobs(), + layer->ExactNumTopBlobs()); for (; num_top < needed_num_top; ++num_top) { // Add "anonymous" top blobs -- do not modify available_blobs or // blob_name_to_idx as we don't want these blobs to be usable as input @@ -137,12 +137,13 @@ void Net::Init(const NetParameter& in_param) { << "Too many params specified for layer " << layer_param.name(); ParamSpec default_param_spec; for (int param_id = 0; param_id < num_param_blobs; ++param_id) { - const ParamSpec* param_spec = (param_id < param_size) ? - &layer_param.param(param_id) : &default_param_spec; + const ParamSpec* param_spec = + (param_id < param_size) ? + &layer_param.param(param_id) : &default_param_spec; const bool param_need_backward = param_spec->lr_mult() > 0; need_backward |= param_need_backward; layers_[layer_id]->set_param_propagate_down(param_id, - param_need_backward); + param_need_backward); } for (int param_id = 0; param_id < num_param_blobs; ++param_id) { AppendParam(param, layer_id, param_id); @@ -161,15 +162,15 @@ void Net::Init(const NetParameter& in_param) { // Also checks if all bottom blobs don't need backward computation (possible // because the skip_propagate_down param) and so we can skip bacward // computation for the entire layer - set blobs_under_loss; - set blobs_skip_backp; + set < string > blobs_under_loss; + set < string > blobs_skip_backp; for (int layer_id = layers_.size() - 1; layer_id >= 0; --layer_id) { bool layer_contributes_loss = false; bool layer_skip_propagate_down = true; for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) { const string& blob_name = blob_names_[top_id_vecs_[layer_id][top_id]]; - if (layers_[layer_id]->loss(top_id) || - (blobs_under_loss.find(blob_name) != blobs_under_loss.end())) { + if (layers_[layer_id]->loss(top_id) + || (blobs_under_loss.find(blob_name) != blobs_under_loss.end())) { layer_contributes_loss = true; } if (blobs_skip_backp.find(blob_name) == blobs_skip_backp.end()) { @@ -183,19 +184,21 @@ void Net::Init(const NetParameter& in_param) { if (layer_need_backward_[layer_id] && layer_skip_propagate_down) { layer_need_backward_[layer_id] = false; for (int bottom_id = 0; bottom_id < bottom_vecs_[layer_id].size(); - ++bottom_id) { + ++bottom_id) { bottom_need_backward_[layer_id][bottom_id] = false; } } - if (!layer_contributes_loss) { layer_need_backward_[layer_id] = false; } + if (!layer_contributes_loss) { + layer_need_backward_[layer_id] = false; + } if (layer_need_backward_[layer_id]) { LOG(INFO) << layer_names_[layer_id] << " needs backward computation."; } else { LOG(INFO) << layer_names_[layer_id] - << " does not need backward computation."; + << " does not need backward computation."; } for (int bottom_id = 0; bottom_id < bottom_vecs_[layer_id].size(); - ++bottom_id) { + ++bottom_id) { if (layer_contributes_loss) { const string& blob_name = blob_names_[bottom_id_vecs_[layer_id][bottom_id]]; @@ -205,7 +208,7 @@ void Net::Init(const NetParameter& in_param) { } if 
(!bottom_need_backward_[layer_id][bottom_id]) { const string& blob_name = - blob_names_[bottom_id_vecs_[layer_id][bottom_id]]; + blob_names_[bottom_id_vecs_[layer_id][bottom_id]]; blobs_skip_backp.insert(blob_name); } } @@ -215,16 +218,16 @@ void Net::Init(const NetParameter& in_param) { for (int layer_id = 0; layer_id < layers_.size(); ++layer_id) { layer_need_backward_[layer_id] = true; for (int bottom_id = 0; - bottom_id < bottom_need_backward_[layer_id].size(); ++bottom_id) { + bottom_id < bottom_need_backward_[layer_id].size(); ++bottom_id) { bottom_need_backward_[layer_id][bottom_id] = - bottom_need_backward_[layer_id][bottom_id] || - layers_[layer_id]->AllowForceBackward(bottom_id); + bottom_need_backward_[layer_id][bottom_id] + || layers_[layer_id]->AllowForceBackward(bottom_id); blob_need_backward_[bottom_id_vecs_[layer_id][bottom_id]] = - blob_need_backward_[bottom_id_vecs_[layer_id][bottom_id]] || - bottom_need_backward_[layer_id][bottom_id]; + blob_need_backward_[bottom_id_vecs_[layer_id][bottom_id]] + || bottom_need_backward_[layer_id][bottom_id]; } for (int param_id = 0; param_id < layers_[layer_id]->blobs().size(); - ++param_id) { + ++param_id) { layers_[layer_id]->set_param_propagate_down(param_id, true); } } @@ -258,7 +261,7 @@ void Net::FilterNet(const NetParameter& param, const LayerParameter& layer_param = param.layer(i); const string& layer_name = layer_param.name(); CHECK(layer_param.include_size() == 0 || layer_param.exclude_size() == 0) - << "Specify either include rules or exclude rules; not both."; + << "Specify either include rules or exclude rules; not both."; // If no include rules are specified, the layer is included by default and // only excluded if it meets one of the exclude rules. bool layer_included = (layer_param.include_size() == 0); @@ -279,16 +282,16 @@ void Net::FilterNet(const NetParameter& param, } template -bool Net::StateMeetsRule(const NetState& state, - const NetStateRule& rule, const string& layer_name) { +bool Net::StateMeetsRule(const NetState& state, const NetStateRule& rule, + const string& layer_name) { // Check whether the rule is broken due to phase. if (rule.has_phase()) { - if (rule.phase() != state.phase()) { - LOG(INFO) << "The NetState phase (" << state.phase() + if (rule.phase() != state.phase()) { + LOG(INFO) << "The NetState phase (" << state.phase() << ") differed from the phase (" << rule.phase() << ") specified by a rule in layer " << layer_name; - return false; - } + return false; + } } // Check whether the rule is broken due to min level. if (rule.has_min_level()) { @@ -314,11 +317,13 @@ bool Net::StateMeetsRule(const NetState& state, // Check that the NetState contains the rule's ith stage. bool has_stage = false; for (int j = 0; !has_stage && j < state.stage_size(); ++j) { - if (rule.stage(i) == state.stage(j)) { has_stage = true; } + if (rule.stage(i) == state.stage(j)) { + has_stage = true; + } } if (!has_stage) { LOG(INFO) << "The NetState did not contain stage '" << rule.stage(i) - << "' specified by a rule in layer " << layer_name; + << "' specified by a rule in layer " << layer_name; return false; } } @@ -328,11 +333,13 @@ bool Net::StateMeetsRule(const NetState& state, // Check that the NetState contains the rule's ith not_stage. 
bool has_stage = false; for (int j = 0; !has_stage && j < state.stage_size(); ++j) { - if (rule.not_stage(i) == state.stage(j)) { has_stage = true; } + if (rule.not_stage(i) == state.stage(j)) { + has_stage = true; + } } if (has_stage) { LOG(INFO) << "The NetState contained a not_stage '" << rule.not_stage(i) - << "' specified by a rule in layer " << layer_name; + << "' specified by a rule in layer " << layer_name; return false; } } @@ -343,22 +350,25 @@ bool Net::StateMeetsRule(const NetState& state, // layer_id == -1, tops have layer_id >= 0.) template void Net::AppendTop(const NetParameter& param, const int layer_id, - const int top_id, set* available_blobs, - map* blob_name_to_idx) { - shared_ptr layer_param((layer_id >= 0) ? - (new LayerParameter(param.layer(layer_id))) : NULL); - const string& blob_name = layer_param ? - (layer_param->top_size() > top_id ? - layer_param->top(top_id) : "(automatic)") : param.input(top_id); + const int top_id, set* available_blobs, + map* blob_name_to_idx) { + shared_ptr < LayerParameter + > layer_param( + (layer_id >= 0) ? (new LayerParameter(param.layer(layer_id))) : NULL); + const string& blob_name = + layer_param ? + (layer_param->top_size() > top_id ? + layer_param->top(top_id) : "(automatic)") : + param.input(top_id); // Check if we are doing in-place computation - if (blob_name_to_idx && layer_param && layer_param->bottom_size() > top_id && - blob_name == layer_param->bottom(top_id)) { + if (blob_name_to_idx && layer_param && layer_param->bottom_size() > top_id + && blob_name == layer_param->bottom(top_id)) { // In-place computation LOG(INFO) << layer_param->name() << " -> " << blob_name << " (in-place)"; top_vecs_[layer_id].push_back(blobs_[(*blob_name_to_idx)[blob_name]].get()); top_id_vecs_[layer_id].push_back((*blob_name_to_idx)[blob_name]); - } else if (blob_name_to_idx && - blob_name_to_idx->find(blob_name) != blob_name_to_idx->end()) { + } else if (blob_name_to_idx + && blob_name_to_idx->find(blob_name) != blob_name_to_idx->end()) { // If we are not doing in-place computation but have duplicated blobs, // raise an error. LOG(FATAL) << "Duplicate blobs produced by multiple sources."; @@ -369,19 +379,20 @@ void Net::AppendTop(const NetParameter& param, const int layer_id, } else { LOG(INFO) << "Input " << top_id << " -> " << blob_name; } - shared_ptr > blob_pointer(new Blob()); + shared_ptr < Blob > blob_pointer(new Blob()); const int blob_id = blobs_.size(); blobs_.push_back(blob_pointer); blob_names_.push_back(blob_name); blob_need_backward_.push_back(false); - if (blob_name_to_idx) { (*blob_name_to_idx)[blob_name] = blob_id; } + if (blob_name_to_idx) { + (*blob_name_to_idx)[blob_name] = blob_id; + } if (layer_id == -1) { // Set the (explicitly specified) dimensions of the input blob. if (param.input_dim_size() > 0) { blob_pointer->Reshape(param.input_dim(top_id * 4), - param.input_dim(top_id * 4 + 1), - param.input_dim(top_id * 4 + 2), - param.input_dim(top_id * 4 + 3)); + param.input_dim(top_id * 4 + 1), param.input_dim(top_id * 4 + 2), + param.input_dim(top_id * 4 + 3)); } else { blob_pointer->Reshape(param.input_shape(top_id)); } @@ -392,7 +403,9 @@ void Net::AppendTop(const NetParameter& param, const int layer_id, top_vecs_[layer_id].push_back(blob_pointer.get()); } } - if (available_blobs) { available_blobs->insert(blob_name); } + if (available_blobs) { + available_blobs->insert(blob_name); + } } // Helper for Net::Init: add a new bottom blob to the net. 
@@ -403,8 +416,8 @@ int Net::AppendBottom(const NetParameter& param, const int layer_id, const LayerParameter& layer_param = param.layer(layer_id); const string& blob_name = layer_param.bottom(bottom_id); if (available_blobs->find(blob_name) == available_blobs->end()) { - LOG(FATAL) << "Unknown blob input " << blob_name - << " (at index " << bottom_id << ") to layer " << layer_id; + LOG(FATAL) << "Unknown blob input " << blob_name << " (at index " + << bottom_id << ") to layer " << layer_id; } const int blob_id = (*blob_name_to_idx)[blob_name]; LOG(INFO) << layer_names_[layer_id] << " <- " << blob_name; @@ -415,15 +428,14 @@ int Net::AppendBottom(const NetParameter& param, const int layer_id, // Check if the backpropagation on bottom_id should be skipped if (layer_param.propagate_down_size() > 0) propagate_down = layer_param.propagate_down(bottom_id); - const bool need_backward = blob_need_backward_[blob_id] && - propagate_down; + const bool need_backward = blob_need_backward_[blob_id] && propagate_down; bottom_need_backward_[layer_id].push_back(need_backward); return blob_id; } template void Net::AppendParam(const NetParameter& param, const int layer_id, - const int param_id) { + const int param_id) { const LayerParameter& layer_param = layers_[layer_id]->layer_param(); const int param_size = layer_param.param_size(); string param_name = @@ -439,8 +451,9 @@ void Net::AppendParam(const NetParameter& param, const int layer_id, params_.push_back(layers_[layer_id]->blobs()[param_id]); param_id_vecs_[layer_id].push_back(net_param_id); param_layer_indices_.push_back(make_pair(layer_id, param_id)); - if (!param_size || !param_name.size() || (param_name.size() && - param_names_index_.find(param_name) == param_names_index_.end())) { + if (!param_size || !param_name.size() + || (param_name.size() + && param_names_index_.find(param_name) == param_names_index_.end())) { // This layer "owns" this parameter blob -- it is either anonymous // (i.e., not given a param_name) or explicitly given a name that we // haven't already seen. @@ -452,19 +465,19 @@ void Net::AppendParam(const NetParameter& param, const int layer_id, // Named param blob with name we've seen before: share params const int owner_net_param_id = param_names_index_[param_name]; param_owners_.push_back(owner_net_param_id); - const pair& owner_index = - param_layer_indices_[owner_net_param_id]; + const pair& owner_index = param_layer_indices_[owner_net_param_id]; const int owner_layer_id = owner_index.first; const int owner_param_id = owner_index.second; LOG(INFO) << "Sharing parameters '" << param_name << "' owned by " - << "layer '" << layer_names_[owner_layer_id] << "', param " - << "index " << owner_param_id; - Blob* this_blob = layers_[layer_id]->blobs()[param_id].get(); - Blob* owner_blob = + << "layer '" << layer_names_[owner_layer_id] << "', param " << "index " + << owner_param_id; + Blob < Dtype > *this_blob = layers_[layer_id]->blobs()[param_id].get(); + Blob < Dtype > *owner_blob = layers_[owner_layer_id]->blobs()[owner_param_id].get(); const int param_size = layer_param.param_size(); - if (param_size > param_id && (layer_param.param(param_id).share_mode() == - ParamSpec_DimCheckMode_PERMISSIVE)) { + if (param_size > param_id + && (layer_param.param(param_id).share_mode() + == ParamSpec_DimCheckMode_PERMISSIVE)) { // Permissive dimension checking -- only check counts are the same. 
CHECK_EQ(this_blob->count(), owner_blob->count()) << "Shared parameter blobs must have the same count."; @@ -482,11 +495,11 @@ void Net::GetLearningRateAndWeightDecay() { LOG(INFO) << "Collecting Learning Rate and Weight Decay."; ParamSpec default_param_spec; for (int i = 0; i < layers_.size(); ++i) { - vector > >& layer_blobs = layers_[i]->blobs(); + vector < shared_ptr > > &layer_blobs = layers_[i]->blobs(); for (int j = 0; j < layer_blobs.size(); ++j) { const ParamSpec* param_spec = (layers_[i]->layer_param().param_size() > j) ? - &layers_[i]->layer_param().param(j) : &default_param_spec; + &layers_[i]->layer_param().param(j) : &default_param_spec; params_lr_.push_back(param_spec->lr_mult()); params_weight_decay_.push_back(param_spec->decay_mult()); } @@ -503,12 +516,29 @@ Dtype Net::ForwardFromTo(int start, int end) { InputDebugInfo(i); } } + + CPUTimer forward_timer; + CPUTimer layer_timer; + forward_timer.Start(); + for (int i = start; i <= end; ++i) { - // LOG(ERROR) << "Forwarding " << layer_names_[i]; + layer_timer.Start(); Dtype layer_loss = layers_[i]->Forward(bottom_vecs_[i], top_vecs_[i]); loss += layer_loss; - if (debug_info_) { ForwardDebugInfo(i); } + if (debug_info_) { + ForwardDebugInfo(i); + } +#ifndef CPU_ONLY + clFinish(amdDevice.CommandQueue); +#endif + layer_timer.Stop(); + printf("Forwarding %s,\ttime %f ms\n", layer_names_[i].c_str(), + layer_timer.MilliSeconds()); } + + forward_timer.Stop(); + printf("Total Forward time: %f\n\n", forward_timer.MilliSeconds()); + return loss; } @@ -567,13 +597,30 @@ template void Net::BackwardFromTo(int start, int end) { CHECK_GE(end, 0); CHECK_LT(start, layers_.size()); + + CPUTimer backward_timer; + CPUTimer layer_timer; + backward_timer.Start(); + for (int i = start; i >= end; --i) { + layer_timer.Start(); if (layer_need_backward_[i]) { - layers_[i]->Backward( - top_vecs_[i], bottom_need_backward_[i], bottom_vecs_[i]); - if (debug_info_) { BackwardDebugInfo(i); } + layers_[i]->Backward(top_vecs_[i], bottom_need_backward_[i], + bottom_vecs_[i]); + if (debug_info_) { + BackwardDebugInfo(i); + } +#ifndef CPU_ONLY + clFinish(amdDevice.CommandQueue); +#endif + layer_timer.Start(); + printf("Backwarding %s,\ttime %f ms\n", layer_names_[i].c_str(), + layer_timer.MilliSeconds()); } } + + backward_timer.Stop(); + printf("Total Backward time: %f\n\n", backward_timer.MilliSeconds()); } template @@ -581,8 +628,8 @@ void Net::InputDebugInfo(const int input_id) { const Blob& blob = *net_input_blobs_[input_id]; const string& blob_name = blob_names_[net_input_blob_indices_[input_id]]; const Dtype data_abs_val_mean = blob.asum_data() / blob.count(); - LOG(INFO) << " [Forward] " - << "Input " << blob_name << " data: " << data_abs_val_mean; + LOG(INFO) << " [Forward] " << "Input " << blob_name << " data: " + << data_abs_val_mean; } template @@ -591,19 +638,17 @@ void Net::ForwardDebugInfo(const int layer_id) { const Blob& blob = *top_vecs_[layer_id][top_id]; const string& blob_name = blob_names_[top_id_vecs_[layer_id][top_id]]; const Dtype data_abs_val_mean = blob.asum_data() / blob.count(); - LOG(INFO) << " [Forward] " - << "Layer " << layer_names_[layer_id] << ", top blob " << blob_name - << " data: " << data_abs_val_mean; + LOG(INFO) << " [Forward] " << "Layer " << layer_names_[layer_id] + << ", top blob " << blob_name << " data: " << data_abs_val_mean; } for (int param_id = 0; param_id < layers_[layer_id]->blobs().size(); - ++param_id) { + ++param_id) { const Blob& blob = *layers_[layer_id]->blobs()[param_id]; const int net_param_id = 
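The per-layer timing added to ForwardFromTo and BackwardFromTo follows one pattern: start a CPUTimer, run the layer, call clFinish(amdDevice.CommandQueue) so that the asynchronously enqueued OpenCL work has actually finished before the timer is read, then stop and print. A minimal sketch of the forward-path pattern as added above (the backward hunk is analogous, though it calls layer_timer.Start() a second time where the forward path calls Stop()):

layer_timer.Start();
Dtype layer_loss = layers_[i]->Forward(bottom_vecs_[i], top_vecs_[i]);
loss += layer_loss;
#ifndef CPU_ONLY
clFinish(amdDevice.CommandQueue);  // drain the OpenCL queue; without this the timer
                                   // would only measure kernel-enqueue time
#endif
layer_timer.Stop();
printf("Forwarding %s,\ttime %f ms\n", layer_names_[i].c_str(),
    layer_timer.MilliSeconds());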
param_id_vecs_[layer_id][param_id]; const string& blob_name = param_display_names_[net_param_id]; const Dtype data_abs_val_mean = blob.asum_data() / blob.count(); - LOG(INFO) << " [Forward] " - << "Layer " << layer_names_[layer_id] << ", param blob " << blob_name - << " data: " << data_abs_val_mean; + LOG(INFO) << " [Forward] " << "Layer " << layer_names_[layer_id] + << ", param blob " << blob_name << " data: " << data_abs_val_mean; } } @@ -611,22 +656,24 @@ template void Net::BackwardDebugInfo(const int layer_id) { const vector*>& bottom_vec = bottom_vecs_[layer_id]; for (int bottom_id = 0; bottom_id < bottom_vec.size(); ++bottom_id) { - if (!bottom_need_backward_[layer_id][bottom_id]) { continue; } + if (!bottom_need_backward_[layer_id][bottom_id]) { + continue; + } const Blob& blob = *bottom_vec[bottom_id]; const string& blob_name = blob_names_[bottom_id_vecs_[layer_id][bottom_id]]; const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count(); - LOG(INFO) << " [Backward] " - << "Layer " << layer_names_[layer_id] << ", bottom blob " << blob_name - << " diff: " << diff_abs_val_mean; + LOG(INFO) << " [Backward] " << "Layer " << layer_names_[layer_id] + << ", bottom blob " << blob_name << " diff: " << diff_abs_val_mean; } for (int param_id = 0; param_id < layers_[layer_id]->blobs().size(); - ++param_id) { - if (!layers_[layer_id]->param_propagate_down(param_id)) { continue; } + ++param_id) { + if (!layers_[layer_id]->param_propagate_down(param_id)) { + continue; + } const Blob& blob = *layers_[layer_id]->blobs()[param_id]; const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count(); - LOG(INFO) << " [Backward] " - << "Layer " << layer_names_[layer_id] << ", param blob " << param_id - << " diff: " << diff_abs_val_mean; + LOG(INFO) << " [Backward] " << "Layer " << layer_names_[layer_id] + << ", param blob " << param_id << " diff: " << diff_abs_val_mean; } } @@ -639,15 +686,14 @@ void Net::UpdateDebugInfo(const int param_id) { const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count(); if (param_owner < 0) { const Dtype data_abs_val_mean = blob.asum_data() / blob.count(); - LOG(INFO) << " [Update] Layer " << layer_name - << ", param " << param_display_name - << " data: " << data_abs_val_mean << "; diff: " << diff_abs_val_mean; + LOG(INFO) << " [Update] Layer " << layer_name << ", param " + << param_display_name << " data: " << data_abs_val_mean << "; diff: " + << diff_abs_val_mean; } else { const string& owner_layer_name = layer_names_[param_layer_indices_[param_owner].first]; - LOG(INFO) << " [Update] Layer " << layer_name - << ", param blob " << param_display_name - << " (owned by layer " << owner_layer_name << ", " + LOG(INFO) << " [Update] Layer " << layer_name << ", param blob " + << param_display_name << " (owned by layer " << owner_layer_name << ", " << "param " << param_display_names_[param_owners_[param_id]] << ")" << " diff: " << diff_abs_val_mean; } @@ -657,11 +703,11 @@ template void Net::ShareTrainedLayersWith(const Net* other) { int num_source_layers = other->layers().size(); for (int i = 0; i < num_source_layers; ++i) { - Layer* source_layer = other->layers()[i].get(); + Layer < Dtype > *source_layer = other->layers()[i].get(); const string& source_layer_name = other->layer_names()[i]; int target_layer_id = 0; - while (target_layer_id != layer_names_.size() && - layer_names_[target_layer_id] != source_layer_name) { + while (target_layer_id != layer_names_.size() + && layer_names_[target_layer_id] != source_layer_name) { ++target_layer_id; } if (target_layer_id == 
layer_names_.size()) { @@ -669,12 +715,12 @@ void Net::ShareTrainedLayersWith(const Net* other) { continue; } DLOG(INFO) << "Copying source layer " << source_layer_name; - vector > >& target_blobs = + vector < shared_ptr > > &target_blobs = layers_[target_layer_id]->blobs(); CHECK_EQ(target_blobs.size(), source_layer->blobs().size()) << "Incompatible number of blobs for layer " << source_layer_name; for (int j = 0; j < target_blobs.size(); ++j) { - Blob* source_blob = source_layer->blobs()[j].get(); + Blob < Dtype > *source_blob = source_layer->blobs()[j].get(); CHECK(target_blobs[j]->shape() == source_blob->shape()); target_blobs[j]->ShareData(*source_blob); } @@ -697,7 +743,9 @@ void Net::Backward() { if (debug_info_) { Dtype asum_data = 0, asum_diff = 0, sumsq_data = 0, sumsq_diff = 0; for (int i = 0; i < params_.size(); ++i) { - if (param_owners_[i] >= 0) { continue; } + if (param_owners_[i] >= 0) { + continue; + } asum_data += params_[i]->asum_data(); asum_diff += params_[i]->asum_diff(); sumsq_data += params_[i]->sumsq_data(); @@ -725,8 +773,8 @@ void Net::CopyTrainedLayersFrom(const NetParameter& param) { const LayerParameter& source_layer = param.layer(i); const string& source_layer_name = source_layer.name(); int target_layer_id = 0; - while (target_layer_id != layer_names_.size() && - layer_names_[target_layer_id] != source_layer_name) { + while (target_layer_id != layer_names_.size() + && layer_names_[target_layer_id] != source_layer_name) { ++target_layer_id; } if (target_layer_id == layer_names_.size()) { @@ -734,7 +782,7 @@ void Net::CopyTrainedLayersFrom(const NetParameter& param) { continue; } DLOG(INFO) << "Copying source layer " << source_layer_name; - vector > >& target_blobs = + vector < shared_ptr > > &target_blobs = layers_[target_layer_id]->blobs(); CHECK_EQ(target_blobs.size(), source_layer.blobs_size()) << "Incompatible number of blobs for layer " << source_layer_name; @@ -779,11 +827,18 @@ void Net::Update() { // diff. (Assumes that the learning rate, weight decay, etc. have already been // accounted for in the current diff.) for (int i = 0; i < params_.size(); ++i) { - if (param_owners_[i] < 0) { continue; } - if (debug_info_) { UpdateDebugInfo(i); } + if (param_owners_[i] < 0) { + continue; + } + if (debug_info_) { + UpdateDebugInfo(i); + } const int count = params_[i]->count(); const Dtype* this_diff; Dtype* owner_diff; + this_diff = params_[i]->cpu_diff(); + owner_diff = params_[param_owners_[i]]->mutable_cpu_diff(); + switch (Caffe::mode()) { case Caffe::CPU: this_diff = params_[i]->cpu_diff(); @@ -794,7 +849,8 @@ void Net::Update() { #ifndef CPU_ONLY this_diff = params_[i]->gpu_diff(); owner_diff = params_[param_owners_[i]]->mutable_gpu_diff(); - caffe_gpu_add(count, this_diff, owner_diff, owner_diff); + // caffe_gpu_add(count, this_diff, owner_diff, owner_diff); + caffe_gpu_axpy < Dtype > (count, 1.0, this_diff, owner_diff); #else NO_GPU; #endif @@ -805,8 +861,12 @@ void Net::Update() { } // Now, update the owned parameters. 
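One behavioural note on the Net::Update() hunk above: accumulating a shared parameter's gradient into its owner now uses caffe_gpu_axpy instead of caffe_gpu_add on the GPU branch. With alpha = 1.0 the result is identical; axpy simply expresses the accumulation as a single standard BLAS operation:

// caffe_gpu_add(count, a, b, y)       computes y[i] = a[i] + b[i]
// caffe_gpu_axpy(count, alpha, x, y)  computes y[i] = alpha * x[i] + y[i]  (BLAS axpy)
// Hence, for folding a shared blob's diff into its owner's diff:
caffe_gpu_axpy<Dtype>(count, Dtype(1.0), this_diff, owner_diff);  // owner_diff += this_diff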
for (int i = 0; i < params_.size(); ++i) { - if (param_owners_[i] >= 0) { continue; } - if (debug_info_) { UpdateDebugInfo(i); } + if (param_owners_[i] >= 0) { + continue; + } + if (debug_info_) { + UpdateDebugInfo(i); + } params_[i]->Update(); } } @@ -819,11 +879,11 @@ bool Net::has_blob(const string& blob_name) const { template const shared_ptr > Net::blob_by_name( const string& blob_name) const { - shared_ptr > blob_ptr; + shared_ptr < Blob > blob_ptr; if (has_blob(blob_name)) { blob_ptr = blobs_[blob_names_index_.find(blob_name)->second]; } else { - blob_ptr.reset((Blob*)(NULL)); + blob_ptr.reset((Blob*) (NULL)); LOG(WARNING) << "Unknown blob name " << blob_name; } return blob_ptr; @@ -837,16 +897,16 @@ bool Net::has_layer(const string& layer_name) const { template const shared_ptr > Net::layer_by_name( const string& layer_name) const { - shared_ptr > layer_ptr; + shared_ptr < Layer > layer_ptr; if (has_layer(layer_name)) { layer_ptr = layers_[layer_names_index_.find(layer_name)->second]; } else { - layer_ptr.reset((Layer*)(NULL)); + layer_ptr.reset((Layer*) (NULL)); LOG(WARNING) << "Unknown layer name " << layer_name; } return layer_ptr; } -INSTANTIATE_CLASS(Net); +INSTANTIATE_CLASS (Net); } // namespace caffe diff --git a/src/caffe/ocl/bnll_layer.cl b/src/caffe/ocl/bnll_layer.cl new file mode 100644 index 00000000..99d04575 --- /dev/null +++ b/src/caffe/ocl/bnll_layer.cl @@ -0,0 +1,52 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + +#define kBNLL_THRESHOLD 50.0 + +template +__kernel void BNLLForward(const int n, __global const T* in, __global T* out) { + int index = get_global_id(0); + if (index < n) { + out[index] = in[index] > 0 ? in[index] + log(1. + exp(-in[index])) : log(1. 
+ exp(in[index])); + } +} +template __attribute__((mangled_name(BNLLForward_float))) __kernel void BNLLForward(const int n, __global const float* in, __global float* out); +template __attribute__((mangled_name(BNLLForward_double))) __kernel void BNLLForward(const int n, __global const double* in, __global double* out); + +template +__kernel void BNLLBackward(const int n, __global const T* in_diff, + __global const T* in_data, __global T* out_diff) { + int index = get_global_id(0); + if (index < n) { + T expval = exp(min(in_data[index], T(kBNLL_THRESHOLD))); + out_diff[index] = in_diff[index] * expval / (expval + 1.); + } +} + +template __attribute__((mangled_name(BNLLBackward_float))) __kernel void BNLLBackward(const int n, __global const float* in_diff, + __global const float* in_data, __global float* out_diff); +template __attribute__((mangled_name(BNLLBackward_double))) __kernel void BNLLBackward(const int n, __global const double* in_diff, + __global const double* in_data, __global double* out_diff); diff --git a/src/caffe/ocl/concat_layer.cl b/src/caffe/ocl/concat_layer.cl new file mode 100644 index 00000000..a9663fce --- /dev/null +++ b/src/caffe/ocl/concat_layer.cl @@ -0,0 +1,54 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
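The new .cl sources all use the same idiom: each kernel is written once over a template parameter T and then explicitly instantiated with __attribute__((mangled_name(...))), which pins a predictable entry-point name per element type (BNLLForward_float, BNLLForward_double, and so on) that the host can look up by string; templates and mangled_name are AMD OpenCL extensions rather than standard OpenCL C. The matching C++ wrappers are not part of this hunk. A hedged sketch of how such a launch typically looks with the standard OpenCL host API (program, bottom_data, top_data and the work-group size are illustrative assumptions; only amdDevice.CommandQueue appears elsewhere in this patch):

// Hypothetical host-side launch of BNLLForward_float; error checks omitted,
// and `program`, `bottom_data`, `top_data`, `count` assumed to exist in scope.
cl_kernel kernel = clCreateKernel(program, "BNLLForward_float", NULL);
cl_int n = count;
clSetKernelArg(kernel, 0, sizeof(cl_int), &n);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &bottom_data);   // __global const float* in
clSetKernelArg(kernel, 2, sizeof(cl_mem), &top_data);      // __global float* out
size_t local_size = 256;
size_t global_size = ((count + local_size - 1) / local_size) * local_size;
// global size is rounded up to a multiple of the work-group size; the kernel's
// "if (index < n)" guard handles the padded work-items.
clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
    &global_size, &local_size, 0, NULL, NULL);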
+ **************************************************************************************/ + +template +__kernel void Concat(const int nthreads, __global const T* in_data, + const int forward, const int num_concats, const int concat_size, + const int top_concat_axis, const int bottom_concat_axis, + const int offset_concat_axis, __global T* out_data) { + int index = get_global_id(0); + if(index < nthreads) { + const int total_concat_size = concat_size * bottom_concat_axis; + const int concat_num = index / total_concat_size; + const int concat_index = index % total_concat_size; + const int top_index = concat_index + + (concat_num * top_concat_axis + offset_concat_axis) * concat_size; + if (forward == 1) { + out_data[top_index] = in_data[index]; + } else { + out_data[index] = in_data[top_index]; + } + } +} + +template __attribute__((mangled_name(Concat_float))) __kernel void Concat(const int nthreads, __global const float* in_data, + const int forward, const int num_concats, const int concat_size, + const int top_concat_axis, const int bottom_concat_axis, + const int offset_concat_axis, __global float* out_data); +template __attribute__((mangled_name(Concat_double))) __kernel void Concat(const int nthreads, __global const double* in_data, + const int forward, const int num_concats, const int concat_size, + const int top_concat_axis, const int bottom_concat_axis, + const int offset_concat_axis, __global double* out_data); diff --git a/src/caffe/ocl/contrastive_loss_layer.cl b/src/caffe/ocl/contrastive_loss_layer.cl new file mode 100644 index 00000000..477f2ff4 --- /dev/null +++ b/src/caffe/ocl/contrastive_loss_layer.cl @@ -0,0 +1,64 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
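A concrete reading of the Concat index math above, with assumed shapes for illustration: concatenating a 2x3x4x4 and a 2x5x4x4 blob along the channel axis gives concat_size = 4*4 = 16 (the spatial size below the concat axis), top_concat_axis = 3+5 = 8, and num_concats = 2. For the second bottom, bottom_concat_axis = 5 and offset_concat_axis = 3, so each of its nthreads = 2*5*16 = 160 elements is split into concat_num = index / 80 and concat_index = index % 80 and lands at top_index = concat_index + (concat_num * 8 + 3) * 16. The backward call (forward == 0) reads through the same mapping in the opposite direction.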
+ **************************************************************************************/ + +template +__kernel void CLLBackward(const int count, const int channels, + const Dtype margin, const bool legacy_version, const Dtype alpha, + __global const Dtype* y, __global const Dtype* diff, __global const Dtype* dist_sq, + __global Dtype *bottom_diff) { + int i = get_global_id(0); + if(i < count) { + int n = i / channels; // the num index, to access y and dist_sq + if (static_cast(y[n])) { // similar pairs + bottom_diff[i] = alpha * diff[i]; + } else { // dissimilar pairs + Dtype mdist(0.0); + Dtype beta(0.0); + if (legacy_version) { + mdist = (margin - dist_sq[n]); + beta = -alpha; + } else { + Dtype dist = sqrt(dist_sq[n]); + mdist = (margin - dist); + beta = -alpha * mdist / (dist + Dtype(1e-4)) * diff[i]; + } + if (mdist > 0.0) { + bottom_diff[i] = beta; + } else { + bottom_diff[i] = 0; + } + } + } +} + +template __attribute__((mangled_name(CLLBackward_float))) __kernel void CLLBackward(const int count, const int channels, + const float margin, const bool legacy_version, const float alpha, + __global const float* y, __global const float* diff, __global const float* dist_sq, + __global float *bottom_diff); +template __attribute__((mangled_name(CLLBackward_double))) __kernel void CLLBackward(const int count, const int channels, + const double margin, const bool legacy_version, const double alpha, + __global const double* y, __global const double* diff, __global const double* dist_sq, + __global double *bottom_diff); diff --git a/src/caffe/ocl/dropout_layer.cl b/src/caffe/ocl/dropout_layer.cl new file mode 100644 index 00000000..98d44f86 --- /dev/null +++ b/src/caffe/ocl/dropout_layer.cl @@ -0,0 +1,45 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ **************************************************************************************/ + +template +__kernel void DropoutForward(const int n, __global T *in, __global const unsigned int* mask, const unsigned int threshold, const float scale, __global T *out) { + int index = get_global_id(0); + if (index < n) { + out[index] = in[index] * scale * (mask[index] > threshold); + } +} +template __attribute__((mangled_name(DropoutForward_float))) __kernel void DropoutForward(const int n, __global float* in, __global const unsigned int* mask, const unsigned int threshold, const float scale, __global float* out); +template __attribute__((mangled_name(DropoutForward_double))) __kernel void DropoutForward(const int n, __global double* in, __global const unsigned int* mask, const unsigned int threshold, const float scale, __global double* out); + +template +__kernel void DropoutBackward(const int n, __global T *in_diff, __global const unsigned int *mask, const unsigned int threshold, const float scale, __global T *out_diff) { + int index = get_global_id(0); + if (index < n) { + out_diff[index] = in_diff[index] * scale * (mask[index] > threshold); + } +} +template __attribute__((mangled_name(DropoutBackward_float))) __kernel void DropoutBackward(const int n, __global float* in_diff, __global const unsigned int* mask, const unsigned int threshold, const float scale, __global float* out_diff); +template __attribute__((mangled_name(DropoutBackward_double))) __kernel void DropoutBackward(const int n, __global double* in_diff, __global const unsigned int* mask, const unsigned int threshold, const float scale, __global double* out_diff); diff --git a/src/caffe/ocl/eltwise_layer.cl b/src/caffe/ocl/eltwise_layer.cl new file mode 100644 index 00000000..88137dd7 --- /dev/null +++ b/src/caffe/ocl/eltwise_layer.cl @@ -0,0 +1,73 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ **************************************************************************************/ + +template +__kernel void MaxForward(const int nthreads, __global const Dtype* bottom_data_a, + __global const Dtype* bottom_data_b, const int blob_idx, __global Dtype* top_data, + __global int* mask) { + int index = get_global_id(0); + if(index < nthreads) { + Dtype maxval = -FLT_MAX; + int maxidx = -1; + if (bottom_data_a[index] > bottom_data_b[index]) { + // only update for very first bottom_data blob (blob_idx == 0) + if (blob_idx == 0) { + maxval = bottom_data_a[index]; + top_data[index] = maxval; + maxidx = blob_idx; + mask[index] = maxidx; + } + } else { + maxval = bottom_data_b[index]; + top_data[index] = maxval; + maxidx = blob_idx + 1; + mask[index] = maxidx; + } + } +} +template __attribute__((mangled_name(MaxForward_float))) __kernel void MaxForward(const int nthreads, __global const float* bottom_data_a, + __global const float* bottom_data_b, const int blob_idx, __global float* top_data, + __global int* mask); +template __attribute__((mangled_name(MaxForward_double))) __kernel void MaxForward(const int nthreads, __global const double* bottom_data_a, + __global const double* bottom_data_b, const int blob_idx, __global double* top_data, + __global int* mask); + +template +__kernel void MaxBackward(const int nthreads, __global const Dtype* top_diff, + const int blob_idx, __global const int* mask, __global Dtype* bottom_diff) { + int index = get_global_id(0); + if(index < nthreads) { + Dtype gradient = 0; + if (mask[index] == blob_idx) { + gradient += top_diff[index]; + } + bottom_diff[index] = gradient; + } +} +template __attribute__((mangled_name(MaxBackward_float))) __kernel void MaxBackward(const int nthreads, __global const float* top_diff, + const int blob_idx, __global const int* mask, __global float* bottom_diff); +template __attribute__((mangled_name(MaxBackward_double))) __kernel void MaxBackward(const int nthreads, __global const double* top_diff, + const int blob_idx, __global const int* mask, __global double* bottom_diff); diff --git a/src/caffe/ocl/im2col.cl b/src/caffe/ocl/im2col.cl new file mode 100644 index 00000000..f1a97eab --- /dev/null +++ b/src/caffe/ocl/im2col.cl @@ -0,0 +1,231 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
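MaxForward above computes an elementwise maximum over two inputs at a time and records in mask which bottom blob supplied the winning value; blob_idx identifies which pairwise pass is running, hence the special-casing of the very first pass (blob_idx == 0). Stock Caffe's eltwise MAX forward path folds the bottoms in one at a time, which is presumably how this kernel is driven here as well; a sketch of that calling pattern, with the host wrapper name assumed:

// First pass compares bottom[0] against bottom[1] (blob_idx = 0); each further
// bottom is then folded into the running maximum held in top_data.
MaxForward<Dtype>(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), 0, top_data, mask);
for (int i = 2; i < bottom.size(); ++i) {
  MaxForward<Dtype>(count, top_data, bottom[i]->gpu_data(), i - 1, top_data, mask);
}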
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ +template +__kernel void im2col_opt(const int n, __global T* data_im, const int channels, const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int height_col, const int width_col, __global T* data_col, const int col_offset, const int optnum) { + + int index = get_global_id(0); + + data_im = data_im + img_offset; + data_col = data_col + col_offset; + + int x_out = index % width_col; + int y_out = (index / width_col) % height_col; + int channel_in = (index / width_col / height_col) % channels; + int channel_out = channel_in * kernel_h * kernel_w; + int im_id = index / width_col / height_col / channels; + + int y_in = y_out * stride_h - pad_h; + int x_in = x_out * stride_w - pad_w; + int offset_col = channel_out * optnum * height_col * width_col + im_id * height_col * width_col; + int offset_im = im_id * channels * height * width + channel_in * height * width; + + for(int k_h = 0; k_h < kernel_h; k_h++) { + for(int k_w = 0; k_w < kernel_w; k_w++) { + int x_im = x_in + k_w; + int y_im = y_in + k_h; + int index_im = y_im * width + x_im; + int index_col = (k_h * kernel_w + k_w) * optnum * height_col * width_col + y_out * width_col + x_out; + if(y_im >= 0 && y_im < height && x_im >= 0 && x_im < width) + data_col[offset_col + index_col] = data_im[offset_im + index_im]; + else + data_col[offset_col + index_col] = 0; + } + } +} + +template __attribute__((mangled_name(im2col_opt_float))) __kernel void im2col_opt(const int n, __global float* data_im, const int channels, const int lmg_offset, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int height_col, const int width_col, __global float* data_col, const int col_offset, const int optnum); +template __attribute__((mangled_name(im2col_opt_double))) __kernel void im2col_opt(const int n, __global double* data_im, const int channels, const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int tride_h, const int stride_w, const int height_col, const int width_col, __global double* data_col, const int col_offset, const int optnum); + +template +__kernel void im2col(const int n, __global const T* data_im, const int img_offset, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int height_col, const int width_col, + __global T* data_col, const int col_offset) { + data_im = data_im + img_offset; + data_col = data_col + col_offset; + + int index = get_global_id(0); + if(index < n) { + int w_out = index % width_col; + int h_index = index / width_col; + int h_out = h_index % height_col; + int channel_in = h_index / 
height_col; + int channel_out = channel_in * kernel_h * kernel_w; + int h_in = h_out * stride_h - pad_h; + int w_in = w_out * stride_w - pad_w; + __global T* data_col_ptr = data_col; + data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out; + __global const T* data_im_ptr = data_im; + data_im_ptr += (channel_in * height + h_in) * width + w_in; + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + int h = h_in + i; + int w = w_in + j; + *data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ? + data_im_ptr[i * width + j] : 0; + data_col_ptr += height_col * width_col; + } + } + } +} + +template __attribute__((mangled_name(im2col_float))) void im2col(const int n, __global const float* data_im, + const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int height_col, const int width_col, __global float* data_col, const int col_offset); +template __attribute__((mangled_name(im2col_double))) void im2col(const int n, __global const double* data_im, + const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int height_col, const int width_col, __global double* data_col, const int col_offset); + +template +__kernel void col2im(const int n, __global const T* data_col, const int col_offset, + const int height, const int width, const int channels, + const int patch_h, const int patch_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int height_col, const int width_col, + __global T* data_im, const int img_offset) { + data_col = data_col + col_offset; + data_im = data_im + img_offset; + int index = get_global_id(0); + if(index < n) { + T val = 0; + int w = index % width + pad_w; + int h = (index / width) % height + pad_h; + int c = index / (width * height); + // compute the start and end of the output + int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1; + int w_col_end = min(w / stride_w + 1, width_col); + int h_col_start = (h < patch_h) ? 
0 : (h - patch_h) / stride_h + 1; + int h_col_end = min(h / stride_h + 1, height_col); + // equivalent implementation + int offset = + (c * patch_h * patch_w + h * patch_w + w) * height_col * width_col; + int coeff_h_col = (1 - stride_h * patch_w * height_col) * width_col; + int coeff_w_col = (1 - stride_w * height_col * width_col); + for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { + for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { + val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col]; + } + } + data_im[index] = val; + } +} + +template __attribute__((mangled_name(col2im_float))) __kernel void col2im(const int n, __global const float* data_col, const int col_offset, + const int height, const int width, const int channels, + const int patch_h, const int patch_w,const int pad_h, const int pad_w, + const int stride_h, const int stride_w,const int height_col, const int width_col, + __global float* data_im, const int img_offset); +template __attribute__((mangled_name(col2im_double))) __kernel void col2im(const int n, __global const double* data_col, + const int col_offset, const int height, const int width, const int channels, + const int patch_h, const int patch_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, const int height_col, const int width_col, __global double* data_im, const int img_offset); + +template +__kernel void col2im_opt(const int n, __global T* data_col, const int col_offset, const int height, const int width, const int channels, +const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, const int height_col, const int width_col, __global T* data_im, const int img_offset, const int optnum) { + int index = get_global_id(0); + data_col = data_col + col_offset; + data_im = data_im + img_offset; + if(index < n) { + T val = 0; + int w = index % width + pad_w; + int h = (index / width) % height + pad_h; + int c = index / (width * height) % channels; + int im = index / width / height / channels; + // compute the start and end of the output + int w_col_start = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1; + int w_col_end = min(w / stride_w + 1, width_col); + int h_col_start = (h < kernel_h) ? 
0 : (h - kernel_h) / stride_h + 1; + int h_col_end = min(h / stride_h + 1, height_col); + // equivalent implementation + int offset = (c * kernel_h * kernel_w + h * kernel_w + w) * height_col * width_col * optnum + im * height_col * width_col; + int coeff_h_col = (1 - stride_h * kernel_w * height_col * optnum) * width_col; + int coeff_w_col = (1 - stride_w * height_col * width_col * optnum); + for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { + for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { + val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col]; + } + } + data_im[index] = val; + } +} +template __attribute__((mangled_name(col2im_opt_float))) __kernel void col2im_opt(const int n, __global float* data_col, const int col_offset, const int height, const int width, const int channels, const int patch_h, const int patch_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, const int height_col, const int width_col, __global float* data_im, const int img_offset, const int optnum); +template __attribute__((mangled_name(col2im_opt_double))) __kernel void col2im_opt(const int n, __global double* data_col, const int col_offset, const int height, const int width, const int channels, const int patch_h, const int patch_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w,const int height_col, const int width_col, __global double* data_im, const int img_offset, const int optnum); + +template +__kernel void opttrans(const int n, __global T* data_im, const int im_offset, const int height, const int width, const int channels, __global T* data_opt, const int opt_offset, const int optnum) { + + int index = get_global_id(0); + data_opt = data_opt + opt_offset; + data_im = data_im + im_offset; + if(index < n) { + int w = index % width; + int h = (index / width) % height; + int c = index / (width * height) % channels; + int im = index / width / height / channels; + + int opt_index = c * height * optnum * width + h * optnum * width + im * width + w; + data_opt[opt_index] = data_im[index]; + } +} +template __attribute__((mangled_name(opttrans_float))) __kernel void opttrans(const int n, __global float* data_im, const int im_offset, const int height, const int width, const int channels, __global float* data_opt, const int opt_offset, const int optnum); +template __attribute__((mangled_name(opttrans_double))) __kernel void opttrans(const int n, __global double* data_im, const int im_offset, const int height, const int width, const int channels, __global double* data_opt, const int opt_offset, const int optnum); + +template +__kernel void transpose(__global const T *src, __global T* dst, int width, int height, int optnum) { + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int gidyy = gidy; + int index = gidy / height; + int offset = index * width * height; + gidy = gidy % height; + if( gidx < width && gidyy < height * optnum ) + dst[offset + height * gidx + gidy] = src[offset + width * gidy + gidx]; +} +template __attribute__((mangled_name(transpose_float))) __kernel void transpose(__global const float* src, __global float* dst, const int width, const int height, int optnum); +template __attribute__((mangled_name(transpose_double))) __kernel void transpose(__global const double* src, __global double* dst, const int width, const int heighti, int optnum); + +template +__kernel void transform(__global const T *src, __global T* dst, int top_offset, int width, int height, int optnum) { + int gidx = get_global_id(0); + int 
index; + index = (optnum==1) ? 0: gidx % optnum; + dst = dst + top_offset; // now we point at (*top)[n] + int offset = gidx / optnum; + int i = 0; + for(i = 0; i < width; i++) + dst[(index * height + offset)* width + i] = src[gidx * width + i]; +} +template __attribute__((mangled_name(transform_float))) __kernel void transform(__global const float* src, __global float* dst, int top_offset, const int width, const int height, const int optnum); +template __attribute__((mangled_name(transform_double))) __kernel void transform(__global const double* src, __global double* dst, int top_offset, const int width, const int height, const int optnum); diff --git a/src/caffe/ocl/lrn_layer.cl b/src/caffe/ocl/lrn_layer.cl new file mode 100644 index 00000000..67eed4ae --- /dev/null +++ b/src/caffe/ocl/lrn_layer.cl @@ -0,0 +1,139 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ **************************************************************************************/ + +template +__kernel void LRNComputeOutput(const int nthreads, __global T* in, __global T* scale, const T negative_beta, __global T* out) { + int index = get_global_id(0); + int tmp = get_global_size(0); + for(index; index < nthreads; index += tmp) + out[index] = in[index] * pow(scale[index], negative_beta); +} +template __attribute__((mangled_name(LRNComputeOutput_float))) __kernel void LRNComputeOutput(const int nthreads, __global float* in, __global float* scale, const float negative_beta, __global float* out); +template __attribute__((mangled_name(LRNComputeOutput_double))) __kernel void LRNComputeOutput(const int nthreads, __global double* in, __global double* scale, const double negative_beta, __global double* out); + +template +__kernel void LRNFillScale(const int nthreads, __global T* in, const int num, const int channels, const int height, const int width, const int size, const T alpha_over_size, const T k, __global T* scale) { + int index = get_global_id(0); + int tmp = get_global_size(0); + for(index; index < nthreads; index += tmp) { + // find out the local offset + const int w = index % width; + const int h = (index / width) % height; + const int n = index / width / height; + const int offset = (n * channels * height + h) * width + w; + const int step = height * width; + in = in + offset; + scale = scale + offset; + int head = 0; + const int pre_pad = (size - 1) / 2; + const int post_pad = size - pre_pad - 1; + T accum_scale = 0; + // fill the scale at [n, :, h, w] + // accumulate values + while (head < post_pad && head < channels) { + accum_scale += in[head * step] * in[head * step]; + ++head; + } + // both add and subtract + while (head < channels) { + accum_scale += in[head * step] * in[head * step]; + if (head - size >= 0) { + accum_scale -= in[(head - size) * step] + * in[(head - size) * step]; + } + scale[(head - post_pad) * step] = k + accum_scale * alpha_over_size; + ++head; + } + // subtract only + while (head < channels + post_pad) { + if (head - size >= 0) { + accum_scale -= in[(head - size) * step] + * in[(head - size) * step]; + } + scale[(head - post_pad) * step] = k + accum_scale * alpha_over_size; + ++head; + } + } +} +template __attribute__((mangled_name(LRNFillScale_float))) __kernel void LRNFillScale (const int nthreads, __global float* in, const int num, const int channels, const int height, const int width, const int size, const float alpha_over_size, const float k, __global float* scale); +template __attribute__((mangled_name(LRNFillScale_double))) __kernel void LRNFillScale (const int nthreads, __global double* in, const int num, const int channels, const int height, const int width, const int size, const double alpha_over_size, const double k, __global double* scale); + +template +__kernel void LRNComputeDiff(const int nthreads, __global T* bottom_data, __global T* top_data, __global T* scale, __global T* top_diff, const int num, const int channels, const int height, const int width, const int size, const T negative_beta, const T cache_ratio, __global T* bottom_diff) { + int index = get_global_id(0); + int tmp = get_global_size(0); + for(index; index < nthreads; index += tmp) { + const int w = index % width; + const int h = (index / width) % height; + const int n = index / width / height; + const int offset = (n * channels * height + h) * width + w; + const int step = height * width; + bottom_data += offset; + top_data += offset; + scale += offset; + top_diff += 
offset; + bottom_diff += offset; + int head = 0; + const int pre_pad = size - (size + 1) / 2; + const int post_pad = size - pre_pad - 1; + T accum_ratio = 0; + // accumulate values + while (head < post_pad && head < channels) { + accum_ratio += top_diff[head * step] * top_data[head * step] / + scale[head * step]; + ++head; + } + // both add and subtract + while (head < channels) { + accum_ratio += top_diff[head * step] * top_data[head * step] / + scale[head * step]; + if (head - size >= 0) { + accum_ratio -= top_diff[(head - size) * step] * + top_data[(head - size) * step] / scale[(head - size) * step]; + } + bottom_diff[(head - post_pad) * step] = + top_diff[(head - post_pad) * step] + * pow(scale[(head - post_pad) * step], negative_beta) + - cache_ratio * bottom_data[(head - post_pad) * step] * accum_ratio; + ++head; + } + // subtract only + while (head < channels + post_pad) { + if (head - size >= 0) { + accum_ratio -= top_diff[(head - size) * step] * + top_data[(head - size) * step] / scale[(head - size) * step]; + } + bottom_diff[(head - post_pad) * step] = + top_diff[(head - post_pad) * step] + * pow(scale[(head - post_pad) * step], negative_beta) + - cache_ratio * bottom_data[(head - post_pad) * step] * accum_ratio; + ++head; + } + } +} + +template __attribute__((mangled_name(LRNComputeDiff_float))) __kernel void LRNComputeDiff(const int nthreads, __global float* bottom_data, __global float* top_data, __global float* scale, __global float* top_diff, const int num, const int channels, const int height, const int width, const int size, const float negative_beta, const float cache_ratio, __global float* bottom_diff); +template __attribute__((mangled_name(LRNComputeDiff_double))) __kernel void LRNComputeDiff(const int nthreads, __global double* bottom_data, __global double* top_data, __global double* scale, __global double* top_diff, const int num, const int channels, const int height, const int width, const int size, const double negative_beta, const double cache_ratio, __global double* bottom_diff); diff --git a/src/caffe/ocl/pooling_layer.cl b/src/caffe/ocl/pooling_layer.cl new file mode 100644 index 00000000..49a1413a --- /dev/null +++ b/src/caffe/ocl/pooling_layer.cl @@ -0,0 +1,293 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + +template +__kernel void MaxPoolForward(const int nthreads, __global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global T* top_data, __global int* mask, __global T* top_mask) { + int index = get_global_id(0); + int tmp = get_global_size(0); + for(index; index < nthreads; index += tmp) { + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + const int hend = min(hstart + kernel_h, height); + const int wend = min(wstart + kernel_w, width); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + T maxval = -FLT_MAX; + int maxidx = -1; + bottom_data = + bottom_data + (n * channels + c) * height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + if (bottom_data[h * width + w] > maxval) { + maxidx = h * width + w; + maxval = bottom_data[maxidx]; + } + } + } + top_data[index] = maxval; + if (mask) { + mask[index] = maxidx; + } else { + top_mask[index] = maxidx; + } + } +} +template __attribute__((mangled_name(MaxPoolForward_float))) __kernel void MaxPoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width,const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* top_data, __global int* mask, __global float* top_mask); +template __attribute__((mangled_name(MaxPoolForward_double))) __kernel void MaxPoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* top_data, __global int* mask, __global double* top_mask); + +template +__kernel void AvePoolForward(const int nthreads, __global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global T* top_data) { + int index = get_global_id(0); + int tmp = get_global_size(0); + for(index; index < nthreads; index+=tmp) { + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; int hstart = ph 
* stride_h - pad_h; int wstart = pw * stride_w - pad_w; + int hend = min(hstart + kernel_h, height + pad_h); + int wend = min(wstart + kernel_w, width + pad_w); + const int pool_size = (hend - hstart) * (wend - wstart); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, height); + wend = min(wend, width); + T aveval = 0; + bottom_data = + bottom_data + (n * channels + c) * height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + aveval += bottom_data[h * width + w]; + } + } + top_data[index] = aveval / pool_size; + } + +} +template __attribute__((mangled_name(AvePoolForward_float))) __kernel void AvePoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global float* top_data); +template __attribute__((mangled_name(AvePoolForward_double))) __kernel void AvePoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global double* top_data); + +template +__kernel void StoPoolForwardTrain(const int nthreads, __global T* bottom_data, const int clnum, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global T* rand_idx, __global T* top_data) { + int index = get_global_id(0); + int tmp = get_global_size(0); + for(index; index < nthreads; index+=tmp) { + const int pw = index % pooled_width; + const int ph = (index / pooled_width) % pooled_height; + const int c = (index / pooled_width / pooled_height) % channels; + const int n = index / pooled_width / pooled_height / channels; + const int hstart = ph * stride_h; + const int hend = min(hstart + kernel_h, height); + const int wstart = pw * stride_w; + const int wend = min(wstart + kernel_w, width); + T cumsum = 0.; + bottom_data = bottom_data + (n * channels + c) * height * width; + // First pass: get sum + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + cumsum += bottom_data[h * width + w]; + } + } + const float thres = rand_idx[index] * cumsum; + // Second pass: get value, and set index. 
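+        // thres is the caller-supplied random number rand_idx[index] scaled by the window sum,
+        // so the second pass below re-walks the window and selects the first element whose
+        // running sum reaches thres; when rand_idx is uniform in [0, 1) each input is therefore
+        // picked with probability proportional to its activation. The chosen element's flattened
+        // index is written back into rand_idx and its value into top_data.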
+ cumsum = 0; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + cumsum += bottom_data[h * width + w]; + if (cumsum >= thres) { + rand_idx[index] = ((n * channels + c) * height + h) * width + w; + top_data[index] = bottom_data[h * width + w]; + return; + } + } + } + } +} +template __attribute__((mangled_name(StoPoolForwardTrain_float))) __kernel void StoPoolForwardTrain(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global float* idx_data, __global float* top_data); +template __attribute__((mangled_name(StoPoolForwardTrain_double))) __kernel void StoPoolForwardTrain(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global double* idx_data, __global double* top_data); + +template +__kernel void StoPoolForwardTest(const int count, __global T* bottom_data, const int clnum, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global T* top_data) { + int index = get_global_id(0); + int tmp = get_global_size(0); + for(index; index < count; index+=tmp) { + const int pw = index % pooled_width; + const int ph = (index / pooled_width) % pooled_height; + const int c = (index / pooled_width / pooled_height) % channels; + const int n = index / pooled_width / pooled_height / channels; + const int hstart = ph * stride_h; + const int hend = min(hstart + kernel_h, height); + const int wstart = pw * stride_w; + const int wend = min(wstart + kernel_w, width); + // We set cumsum to be 0 to avoid divide-by-zero problems T cumsum = FLT_MIN; + T cumsum = FLT_MIN; + T cumvalues = 0.; + bottom_data = bottom_data + (n * channels + c) * height * width; + // First pass: get sum + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + cumsum += bottom_data[h * width + w]; + cumvalues += bottom_data[h * width + w] * bottom_data[h * width + w]; + } + } + top_data[index] = cumvalues / cumsum;} +} +template __attribute__((mangled_name(StoPoolForwardTest_float))) __kernel void StoPoolForwardTest(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w,__global float* top_data); +template __attribute__((mangled_name(StoPoolForwardTest_double))) __kernel void StoPoolForwardTest(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global double* top_data); + +template +__kernel void MaxPoolBackward(const int nthreads, __global T* top_diff, + __global int* mask, __global T* top_mask, const int num, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, const int pad_h, + const int pad_w, __global T* const bottom_diff) { + int index 
= get_global_id(0); + int total = get_global_size(0); + for(index; index < nthreads; index += total) { + // find out the local index + // find out the local offset + const int w = index % width; + const int h = (index / width) % height; + const int c = (index / width / height) % channels; + const int n = index / width / height / channels; + const int phstart = + (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1; + const int phend = min((h + pad_h) / stride_h + 1, pooled_height); + const int pwstart = + (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1; + const int pwend = min((w + pad_w) / stride_w + 1, pooled_width); + T gradient = 0; + const int offset = (n * channels + c) * pooled_height * pooled_width; + top_diff += offset; + if (mask) { + mask = mask + offset; + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + if (mask[ph * pooled_width + pw] == h * width + w) { + gradient += top_diff[ph * pooled_width + pw]; + } + } + } + } else { + top_mask = top_mask + offset; + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + if (top_mask[ph * pooled_width + pw] == h * width + w) { + gradient += top_diff[ph * pooled_width + pw]; + } + } + } + } + bottom_diff[index] = gradient; + } +} +template __attribute__((mangled_name(MaxPoolBackward_float))) __kernel void MaxPoolBackward(const int nthreads, __global float* const top_diff, __global int* const mask, __global float* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* bottom_diff); +template __attribute__((mangled_name(MaxPoolBackward_double))) __kernel void MaxPoolBackward(const int nthreads, __global double* top_diff, __global int* const mask, __global double* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* bottom_diff); + +template +__kernel void AvePoolBackward(const int nthreads, __global T* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global T* const bottom_diff) { + int index = get_global_id(0); + int total = get_global_size(0); + for(index; index < nthreads; index += total) { + int w = index % width + pad_w; + int h = (index / width) % height + pad_h; + int c = (index / width / height) % channels; + int n = index / width / height / channels; + const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; + const int phend = min(h / stride_h + 1, pooled_height); + const int pwstart = (w < kernel_w) ? 
0 : (w - kernel_w) / stride_w + 1; + const int pwend = min(w / stride_w + 1, pooled_width); + T gradient = 0; + top_diff += (n * channels + c) * pooled_height * pooled_width; + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + // figure out the pooling size + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = min(hstart + kernel_h, height + pad_h); + int wend = min(wstart + kernel_w, width + pad_w); + int pool_size = (hend - hstart) * (wend - wstart); + gradient += top_diff[ph * pooled_width + pw] / pool_size; + } + } + bottom_diff[index] = gradient; + } +} + +template __attribute__((mangled_name(AvePoolBackward_float))) __kernel void AvePoolBackward(const int nthreads, __global float* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* bottom_diff); +template __attribute__((mangled_name(AvePoolBackward_double))) __kernel void AvePoolBackward(const int nthreads, __global double* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* bottom_diff); + +template +__kernel void StoPoolBackward(const int nthreads, + __global Dtype* rand_idx, __global Dtype* top_diff, + const int num, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const int kernel_h, const int kernel_w, const int stride_h, + const int stride_w, __global Dtype* bottom_diff) { + int index = get_global_id(0); + int total = get_global_size(0); + for(index; index < nthreads; index += total) { + // find out the local index + // find out the local offset + const int w = index % width; + const int h = (index / width) % height; + const int c = (index / width / height) % channels; + const int n = index / width / height / channels; + const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; + const int phend = min(h / stride_h + 1, pooled_height); + const int pwstart = (w < kernel_w) ? 
0 : (w - kernel_w) / stride_w + 1; + const int pwend = min(w / stride_w + 1, pooled_width); + Dtype gradient = 0; + rand_idx = + rand_idx + (n * channels + c) * pooled_height * pooled_width; + top_diff = + top_diff + (n * channels + c) * pooled_height * pooled_width; + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + gradient += top_diff[ph * pooled_width + pw] * + (index == static_cast(rand_idx[ph * pooled_width + pw])); + } + } + bottom_diff[index] = gradient; + + } +} +template __attribute__ ((mangled_name(StoPoolBackward_float))) __kernel void StoPoolBackward(const int nthreads, + __global float* rand_idx, __global float* top_diff, + const int num, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const int kernel_h, const int kernel_w, const int stride_h, + const int stride_w, __global float* bottom_diff); +template __attribute__ ((mangled_name(StoPoolBackward_double))) __kernel void StoPoolBackward(const int nthreads, + __global double* rand_idx, __global double* top_diff, + const int num, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const int kernel_h, const int kernel_w, const int stride_h, + const int stride_w, __global double* bottom_diff); diff --git a/src/caffe/ocl/prelu_layer.cl b/src/caffe/ocl/prelu_layer.cl new file mode 100644 index 00000000..caff18b9 --- /dev/null +++ b/src/caffe/ocl/prelu_layer.cl @@ -0,0 +1,60 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + +template +__kernel void PReLUForward(const int count, const int channels, const int dim, __global T* in, __global T* out, __global T* slope_data, const int div_factor) { + int index = get_global_id(0); + if(index < count) { + int c = (index / dim) % channels / div_factor; + out[index] = in[index] > 0 ? 
in[index] : in[index] * slope_data[c]; + } +} +template __attribute__ ((mangled_name(PReLUForward_float))) __kernel void PReLUForward(const int count, const int channels, const int dim, __global float* in, __global float* out, __global float* slope_data, const int div_factor); +template __attribute__ ((mangled_name(PReLUForward_double))) __kernel void PReLUForward(const int count, const int channels, const int dim, __global double* in, __global double* out, __global double* slope_data, const int div_factor); + +template +__kernel void PReLUBackward(const int count, const int channels, const int dim, __global T* in_diff, __global T* in_data, __global T* out_diff, __global T* slope_data, const int div_factor) { + int index = get_global_id(0); + if(index < count) { + int c = (index / dim) % channels / div_factor; + out_diff[index] = in_diff[index] * ((in_data[index] > 0) + + (in_data[index] <= 0) * slope_data[c]); + } +} +template __attribute__ ((mangled_name(PReLUBackward_float))) __kernel void PReLUBackward(const int count, const int channels, const int dim, __global float* in_diff, __global float* in_data, __global float* out_diff, __global float* slope_data, const int div_factor); +template __attribute__ ((mangled_name(PReLUBackward_double))) __kernel void PReLUBackward(const int count, const int channels, const int dim, __global double* in_diff, __global double* in_data, __global double* out_diff, __global double* slope_data, const int div_factor); + +template +__kernel void PReLUParamBackward(const int count, __global T* in_diff, const int offset_in_diff, __global T* in_data, const int offset_in_data, __global T* out_diff) { + int index = get_global_id(0); + if(index < count) { + in_diff += offset_in_diff; + in_data += offset_in_data; + out_diff[index] = in_diff[index] * in_data[index] * (in_data[index] <= 0); + } +} +template __attribute__ ((mangled_name(PReLUParamBackward_float))) __kernel void PReLUParamBackward(const int count, __global float* in_diff, const int offset_in_diff, __global float* in_data, const int offset_in_data, __global float* out_diff); +template __attribute__ ((mangled_name(PReLUParamBackward_double))) __kernel void PReLUParamBackward(const int count, __global double* in_diff, const int offset_in_diff, __global double* in_data, const int offset_in_data, __global double* out_diff); diff --git a/src/caffe/ocl/random.cl b/src/caffe/ocl/random.cl new file mode 100644 index 00000000..468240f0 --- /dev/null +++ b/src/caffe/ocl/random.cl @@ -0,0 +1,960 @@ + +#pragma OPENCL EXTENSION cl_amd_printf : enable + +//Note: the random generator has two parts +//first part: the open sourced threefry random generator kernel from DE Shaw Research +//second part: we wrap the kernel up to generate uniform, bernoulli and gaussian distribution generators.
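+//How the two parts fit together: threefry4x32 is a counter-based generator, so for a given
+//4x32-bit counter and key it deterministically returns four 32-bit pseudorandom words; the
+//kernels below derive the key from each work-item's global id, which lets every work-item
+//produce its own stream without any shared RNG state.
+//The wrapper kernels then map the raw words onto the requested distribution, e.g. the uniform
+//wrapper scales v / UINT_MAX into the range [inf, sup] and the bernoulli wrapper compares that
+//uniform value against threshold (with inf = 0, sup = 1, threshold = 0.5 the word 0x80000000
+//maps to ~0.5 and yields a 1).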
+ +//begin: the open sourced random generator from DE Shaw Research +//https://www.deshawresearch.com/resources_random123.html +typedef uint uint32_t; + +struct r123array4x32 { + uint32_t v[4]; +}; + +enum r123_enum_threefry32x4 { + R_32x4_0_0 = 10, + R_32x4_0_1 = 26, + R_32x4_1_0 = 11, + R_32x4_1_1 = 21, + R_32x4_2_0 = 13, + R_32x4_2_1 = 27, + R_32x4_3_0 = 23, + R_32x4_3_1 = 5, + R_32x4_4_0 = 6, + R_32x4_4_1 = 20, + R_32x4_5_0 = 17, + R_32x4_5_1 = 11, + R_32x4_6_0 = 25, + R_32x4_6_1 = 10, + R_32x4_7_0 = 18, + R_32x4_7_1 = 20 +}; + +inline uint32_t RotL_32(uint32_t x, unsigned int N) + __attribute__((always_inline)); +inline uint32_t RotL_32(uint32_t x, unsigned int N) { + return (x << (N & 31)) | (x >> ((32 - N) & 31)); +} + +typedef struct r123array4x32 threefry4x32_ctr_t; +typedef struct r123array4x32 threefry4x32_key_t; +typedef struct r123array4x32 threefry4x32_ukey_t; + +inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, + threefry4x32_ctr_t in, threefry4x32_key_t k) __attribute__((always_inline)); +inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, + threefry4x32_ctr_t in, threefry4x32_key_t k) { + threefry4x32_ctr_t X; + uint32_t ks[4 + 1]; + int i; + ks[4] = 0x1BD11BDA; + /* + for (i = 0; i < 4; i++) + { + ks[i] = k.v[i]; + X.v[i] = in.v[i]; + ks[4] ^= k.v[i]; + }*/ + { + ks[0] = k.v[0]; + X.v[0] = in.v[0]; + ks[4] ^= k.v[0]; + + ks[1] = k.v[1]; + X.v[1] = in.v[1]; + ks[4] ^= k.v[1]; + + ks[2] = k.v[2]; + X.v[2] = in.v[2]; + ks[4] ^= k.v[2]; + + ks[3] = k.v[3]; + X.v[3] = in.v[3]; + ks[4] ^= k.v[3]; + } + X.v[0] += ks[0]; + X.v[1] += ks[1]; + X.v[2] += ks[2]; + X.v[3] += ks[3]; + if (Nrounds > 0) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 1) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 2) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 3) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 3) { + X.v[0] += ks[1]; + X.v[1] += ks[2]; + X.v[2] += ks[3]; + X.v[3] += ks[4]; + X.v[4 - 1] += 1; + } + if (Nrounds > 4) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 5) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 6) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 7) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 7) { + X.v[0] += ks[2]; + X.v[1] += ks[3]; + X.v[2] += ks[4]; + X.v[3] += ks[0]; + X.v[4 - 1] += 2; + } + if (Nrounds > 8) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } + if 
(Nrounds > 9) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 10) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 11) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 11) { + X.v[0] += ks[3]; + X.v[1] += ks[4]; + X.v[2] += ks[0]; + X.v[3] += ks[1]; + X.v[4 - 1] += 3; + } + if (Nrounds > 12) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 13) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 14) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 15) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 15) { + X.v[0] += ks[4]; + X.v[1] += ks[0]; + X.v[2] += ks[1]; + X.v[3] += ks[2]; + X.v[4 - 1] += 4; + } + if (Nrounds > 16) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 17) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 18) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 19) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 19) { + X.v[0] += ks[0]; + X.v[1] += ks[1]; + X.v[2] += ks[2]; + X.v[3] += ks[3]; + X.v[4 - 1] += 5; + } + if (Nrounds > 20) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 21) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 22) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 23) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 23) { + X.v[0] += ks[1]; + X.v[1] += ks[2]; + X.v[2] += ks[3]; + X.v[3] += ks[4]; + X.v[4 - 1] += 6; + } + if (Nrounds > 24) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 25) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= 
X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 26) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 27) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 27) { + X.v[0] += ks[2]; + X.v[1] += ks[3]; + X.v[2] += ks[4]; + X.v[3] += ks[0]; + X.v[4 - 1] += 7; + } + if (Nrounds > 28) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 29) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 30) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 31) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 31) { + X.v[0] += ks[3]; + X.v[1] += ks[4]; + X.v[2] += ks[0]; + X.v[3] += ks[1]; + X.v[4 - 1] += 8; + } + if (Nrounds > 32) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 33) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 34) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 35) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 35) { + X.v[0] += ks[4]; + X.v[1] += ks[0]; + X.v[2] += ks[1]; + X.v[3] += ks[2]; + X.v[4 - 1] += 9; + } + if (Nrounds > 36) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 37) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 38) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 39) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 39) { + X.v[0] += ks[0]; + X.v[1] += ks[1]; + X.v[2] += ks[2]; + X.v[3] += ks[3]; + X.v[4 - 1] += 10; + } + if (Nrounds > 40) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 41) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } + 
if (Nrounds > 42) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 43) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 43) { + X.v[0] += ks[1]; + X.v[1] += ks[2]; + X.v[2] += ks[3]; + X.v[3] += ks[4]; + X.v[4 - 1] += 11; + } + if (Nrounds > 44) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 45) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 46) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 47) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 47) { + X.v[0] += ks[2]; + X.v[1] += ks[3]; + X.v[2] += ks[4]; + X.v[3] += ks[0]; + X.v[4 - 1] += 12; + } + if (Nrounds > 48) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 49) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 50) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 51) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 51) { + X.v[0] += ks[3]; + X.v[1] += ks[4]; + X.v[2] += ks[0]; + X.v[3] += ks[1]; + X.v[4 - 1] += 13; + } + if (Nrounds > 52) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 53) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 54) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 55) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 55) { + X.v[0] += ks[4]; + X.v[1] += ks[0]; + X.v[2] += ks[1]; + X.v[3] += ks[2]; + X.v[4 - 1] += 14; + } + if (Nrounds > 56) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 57) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 58) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] 
^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 59) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 59) { + X.v[0] += ks[0]; + X.v[1] += ks[1]; + X.v[2] += ks[2]; + X.v[3] += ks[3]; + X.v[4 - 1] += 15; + } + if (Nrounds > 60) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 61) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 62) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 63) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 63) { + X.v[0] += ks[1]; + X.v[1] += ks[2]; + X.v[2] += ks[3]; + X.v[3] += ks[4]; + X.v[4 - 1] += 16; + } + if (Nrounds > 64) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 65) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 66) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 67) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 67) { + X.v[0] += ks[2]; + X.v[1] += ks[3]; + X.v[2] += ks[4]; + X.v[3] += ks[0]; + X.v[4 - 1] += 17; + } + if (Nrounds > 68) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 69) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 70) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 71) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 71) { + X.v[0] += ks[3]; + X.v[1] += ks[4]; + X.v[2] += ks[0]; + X.v[3] += ks[1]; + X.v[4 - 1] += 18; + } + return X; +} +//end: the open sourced random generator from DE Shaw Research + +template +__kernel void PRNG_threefry4x32_bernoulli( + __global uint4 *randomnumber, + threefry4x32_ctr_t ctr_i, + T inf, + T sup, + T threshold, + uint nrounds, + uint numrandom +) { + size_t gdx = get_global_id(0); + + uint maxUint = 0; + maxUint--; + float r = (float)maxUint; + + threefry4x32_ctr_t ctr = ctr_i; + threefry4x32_ukey_t ukey; + + ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx; + + threefry4x32_ctr_t random4; + + if ( gdx < numrandom ) + { + 
random4 = threefry4x32_R(nrounds, ctr, ukey); + uint4 frnd; + + frnd.x = ( (((float)random4.v[0]) / r) * (sup - inf) + inf ) > threshold? 1 : 0; + frnd.y = ( (((float)random4.v[1]) / r) * (sup - inf) + inf ) > threshold? 1 : 0; + frnd.z = ( (((float)random4.v[2]) / r) * (sup - inf) + inf ) > threshold? 1 : 0; + frnd.w = ( (((float)random4.v[3]) / r) * (sup - inf) + inf ) > threshold? 1 : 0; + + randomnumber[gdx] = frnd; + } +} + +template __attribute__((mangled_name(RNGBernoulli_float))) __kernel void PRNG_threefry4x32_bernoulli(__global uint4 *randomnumber, threefry4x32_ctr_t ctr_i, float inf, float sup, float threshold, uint nrounds, uint numrandom); + +template __attribute__((mangled_name(RNGBernoulli_double))) __kernel void PRNG_threefry4x32_bernoulli(__global uint4 *randomnumber, threefry4x32_ctr_t ctr_i, double inf, double sup, double threshold, uint nrounds, uint numrandom); + +//end of the looooooong gpu_random_generator kernel + +//We wrap the kernel up to generate uniform, bernoulli and gaussian distribution generators. + +template +__kernel void PRNG_threefry4x32_uniform( + __global float4 *randomnumber, + threefry4x32_ctr_t ctr_i, + T inf, + T sup, + uint nrounds, + uint numrandom +) { + size_t gdx = get_global_id(0); + + uint maxUint = 0; + maxUint--; + float r = (float)maxUint; + + threefry4x32_ctr_t ctr = ctr_i; + threefry4x32_ukey_t ukey; + + ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx; + + threefry4x32_ctr_t random4; + + if ( gdx < numrandom ) + { + random4 = threefry4x32_R(nrounds, ctr, ukey); + float4 frnd; + frnd.x = ( (((float)random4.v[0]) / r) * (sup - inf) + inf ); + frnd.y = ( (((float)random4.v[1]) / r) * (sup - inf) + inf ); + frnd.z = ( (((float)random4.v[2]) / r) * (sup - inf) + inf ); + frnd.w = ( (((float)random4.v[3]) / r) * (sup - inf) + inf ); + randomnumber[gdx] = frnd; + } +} + +template __attribute__((mangled_name(RNGUniform_float))) __kernel void PRNG_threefry4x32_uniform(__global float4 *randomnumber, threefry4x32_ctr_t ctr_i, float inf, float sup, uint nrounds, uint numrandom); + +template __attribute__((mangled_name(RNGUniform_double))) __kernel void PRNG_threefry4x32_uniform(__global float4 *randomnumber, threefry4x32_ctr_t ctr_i, double inf, double sup, uint nrounds, uint numrandom); + +__kernel void PRNG_threefry4x32_uint_uniform( + __global uint4 *randomnumber, + threefry4x32_ctr_t ctr_i, + uint inf, + uint sup, + uint nrounds, + uint numrandom +) { + size_t gdx = get_global_id(0); + + threefry4x32_ctr_t ctr = ctr_i; + threefry4x32_ukey_t ukey; + + ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx; + + threefry4x32_ctr_t random4; + + if ( gdx < numrandom ) + { + random4 = threefry4x32_R(nrounds, ctr, ukey); + uint4 frnd; + frnd.x = random4.v[0] % (sup - inf) + inf; + frnd.y = random4.v[1] % (sup - inf) + inf; + frnd.z = random4.v[2] % (sup - inf) + inf; + frnd.w = random4.v[3] % (sup - inf) + inf; + randomnumber[gdx] = frnd; + } +} + +template +__kernel void PRNG_threefry4x32_gaussian( + __global float4 *randomnumber, + threefry4x32_ctr_t ctr_i, + T E, + T V, + uint nrounds, + uint numrandom +) { + size_t gdx = get_global_id(0); + + uint maxUint = 0; + maxUint--; + float r = (float)maxUint; + + threefry4x32_ctr_t ctr = ctr_i; + threefry4x32_ukey_t ukey1, ukey2; + + ukey1.v[0] = ukey2.v[1] = ukey1.v[2] = ukey2.v[3] = gdx; + ukey2.v[0] = ukey1.v[1] = ukey2.v[2] = ukey1.v[3] = 0; + + threefry4x32_ctr_t random1, random2; + + if ( gdx < numrandom ) + { + random1 = threefry4x32_R(nrounds, ctr, ukey1); + random2 = threefry4x32_R(nrounds,
ctr, ukey2); + float4 frnd1; + + float r1 = (((float)random1.v[0]) / r); // generate a random sequence of uniform distribution + float r2 = (((float)random2.v[0]) / r); + float r3 = (((float)random1.v[1]) / r); + float r4 = (((float)random2.v[1]) / r); + float r5 = (((float)random1.v[2]) / r); + float r6 = (((float)random2.v[2]) / r); + float r7 = (((float)random1.v[3]) / r); + float r8 = (((float)random2.v[3]) / r); + + if(r2 == 0 || r4 == 0 || r6 == 0 || r8 == 0) { + r2 += 0.0001; + r4 += 0.0001; + r6 += 0.0001; + r8 += 0.0001; + } + + frnd1.x = cos(2*M_PI*r1)*sqrt(-2.0*log(r2)) * V + E; // return a pseudo sequence of normal distribution using two above uniform noise data + //frnd2.x = sin(2*M_PI*r1)*sqrt(-2.0*log(r2)); // return the quadrature counterpart of the foregoing pseudo normal distribution sequence + frnd1.y = cos(2*M_PI*r3)*sqrt(-2.0*log(r4)) * V + E;// return a pseudo sequence of normal distribution using two above uniform noise data + //frnd2.y = sin(2*M_PI*r3)*sqrt(-2.0*log(r4)); // return the quadrature counterpart of the foregoing pseudo normal distribution sequence + frnd1.z = cos(2*M_PI*r5)*sqrt(-2.0*log(r6)) * V + E;// return a pseudo sequence of normal distribution using two above uniform noise data + //frnd2.z = sin(2*M_PI*r5)*sqrt(-2.0*log(r6)); // return the quadrature counterpart of the foregoing pseudo normal distribution sequence + frnd1.w = cos(2*M_PI*r7)*sqrt(-2.0*log(r8)) * V + E;// return a pseudo sequence of normal distribution using two above uniform noise data + //frnd2.w = sin(2*M_PI*r7)*sqrt(-2.0*log(r8)); // return the quadrature counterpart of the foregoing pseudo normal distribution sequence + + randomnumber[gdx] = frnd1; + } +} + +template __attribute__((mangled_name(RNGGaussian_float))) __kernel void PRNG_threefry4x32_gaussian(__global float4 *randomnumber, threefry4x32_ctr_t ctr_i, float E, float V, uint nrounds, uint numrandonm); + +template __attribute__((mangled_name(RNGGaussian_double))) __kernel void PRNG_threefry4x32_gaussian(__global float4 *randomnumber, threefry4x32_ctr_t ctr_i, double E, double V, uint nrounds, uint numrandonm); + diff --git a/src/caffe/ocl/relu_layer.cl b/src/caffe/ocl/relu_layer.cl new file mode 100644 index 00000000..e39aa426 --- /dev/null +++ b/src/caffe/ocl/relu_layer.cl @@ -0,0 +1,46 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + +template +__kernel void ReLUForward(const int count, __global T* in, __global T* out, T negative_slope) { + int index = get_global_id(0); + if(index < count) + out[index] = in[index] > 0? in[index]:in[index]*negative_slope; +} + +template __attribute__ ((mangled_name(ReLUForward_float))) __kernel void ReLUForward(const int count, __global float* in, __global float* out, float negative_slope); +template __attribute__ ((mangled_name(ReLUForward_double))) __kernel void ReLUForward(const int count, __global double* in, __global double* out, double negative_slope); + +template +__kernel void ReLUBackward(const int count, __global T* in_diff, __global T* in_data,__global T* out_diff,T negative_slope) { + int index = get_global_id(0); + if(index < count) { + out_diff[index] = in_diff[index] * ((in_data[index] > 0) + (in_data[index] <= 0) * negative_slope); + } +} + +template __attribute__ ((mangled_name(ReLUBackward_float))) __kernel void ReLUBackward(const int count, __global float* in_diff, __global float* in_data, __global float* out_diff, float negative_slope); +template __attribute__ ((mangled_name(ReLUBackward_double))) __kernel void ReLUBackward(const int count, __global double* in_diff, __global double* in_data, __global double* out_diff, double negative_slope); diff --git a/src/caffe/ocl/sigmoid_layer.cl b/src/caffe/ocl/sigmoid_layer.cl new file mode 100644 index 00000000..ac0ef9a9 --- /dev/null +++ b/src/caffe/ocl/sigmoid_layer.cl @@ -0,0 +1,46 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + +template +__kernel void SigmoidForward(const int count, __global T* in, __global T* out) { + int index = get_global_id(0); + if(index < count) + out[index] = 1. / (1. + exp(-in[index])); +} + +template __attribute__ ((mangled_name(SigmoidForward_float))) __kernel void SigmoidForward(const int count, __global float* in, __global float* out); +template __attribute__ ((mangled_name(SigmoidForward_double))) __kernel void SigmoidForward(const int count, __global double* in, __global double* out); + +template +__kernel void SigmoidBackward(const int count, __global T* in_diff, __global T* out_data,__global T* out_diff) { + int index = get_global_id(0); + const T sigmoid_x = out_data[index]; + if(index < count) + out_diff[index] = in_diff[index] * sigmoid_x * (1 - sigmoid_x); +} + +template __attribute__ ((mangled_name(SigmoidBackward_float))) __kernel void SigmoidBackward(const int count, __global float* in_diff, __global float* out_data, __global float* out_diff); +template __attribute__ ((mangled_name(SigmoidBackward_double))) __kernel void SigmoidBackward(const int count, __global double* in_diff, __global double* out_data, __global double* out_diff); diff --git a/src/caffe/ocl/slice_layer.cl b/src/caffe/ocl/slice_layer.cl new file mode 100644 index 00000000..26c6bb34 --- /dev/null +++ b/src/caffe/ocl/slice_layer.cl @@ -0,0 +1,28 @@ +template +__kernel void Slice(const int nthreads, __global const Dtype* in_data, + const int forward, const int num_slices, const int slice_size, + const int bottom_slice_axis, const int top_slice_axis, + const int offset_slice_axis, __global Dtype* out_data) { + int index = get_global_id(0); + if (index < nthreads) { + const int total_slice_size = slice_size * top_slice_axis; + const int slice_num = index / total_slice_size; + const int slice_index = index % total_slice_size; + const int bottom_index = slice_index + + (slice_num * bottom_slice_axis + offset_slice_axis) * slice_size; + if (forward) { + out_data[index] = in_data[bottom_index]; + } else { + out_data[bottom_index] = in_data[index]; + } + } +} + +template __attribute__ ((mangled_name(Slice_float))) __kernel void Slice(const int nthreads, __global const float* in_data, + const int forward, const int num_slices, const int slice_size, + const int bottom_slice_axis, const int top_slice_axis, + const int offset_slice_axis, __global float* out_data); +template __attribute__ ((mangled_name(Slice_double))) __kernel void Slice(const int nthreads, __global const double* in_data, + const int forward, const int num_slices, const int slice_size, + const int bottom_slice_axis, const int top_slice_axis, + const int offset_slice_axis, __global double* out_data); diff --git a/src/caffe/ocl/softmax_layer.cl b/src/caffe/ocl/softmax_layer.cl new file mode 100644 index 00000000..207f0058 --- /dev/null +++ b/src/caffe/ocl/softmax_layer.cl @@ -0,0 +1,171 @@ 
+/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + +template +__kernel void softmax(__global T* prob_data, __global T* loss, __global T* label, int num, int dim, __local T* resultScratch) { + + int gid = get_global_id(0); + int size = get_global_size(0); + + resultScratch[gid] = 0.0; + for(int i = gid; i < num; i += size) { + resultScratch[gid] += -log(prob_data[i * dim + static_cast(label[i])]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(gid < 128) + resultScratch[gid] += resultScratch[gid + 128]; + barrier(CLK_LOCAL_MEM_FENCE); + if(gid < 64) + resultScratch[gid] += resultScratch[gid + 64]; + if(gid < 32) + resultScratch[gid] += resultScratch[gid + 32]; + if(gid < 16) + resultScratch[gid] += resultScratch[gid + 16]; + if(gid < 8) + resultScratch[gid] += resultScratch[gid + 8]; + if(gid < 4) + resultScratch[gid] += resultScratch[gid + 4]; + if(gid < 2) + resultScratch[gid] += resultScratch[gid + 2]; + if(gid < 1) { + resultScratch[gid] += resultScratch[gid + 1]; + loss[0] = resultScratch[gid]; + } +} +template __attribute__ ((mangled_name(softmax_float))) __kernel void softmax (__global float* prob_data, __global float* loss, __global float* label, int num, int dim, __local float* resultScratch); +template __attribute__ ((mangled_name(softmax_double))) __kernel void softmax (__global double* prob_data, __global double* loss, __global double* label, int num, int dim, __local double* resultScratch); + +template +__kernel void softmax_div (const int num, const int dim, __global T* scale, __global T* data) { + //printf("softmax_div\n"); + int index = get_global_id(0); + int total = get_global_size(0); + for(index; index < num*dim; index += total) { + int n = index / dim; + data[index] /= scale[n]; + } +} + +template __attribute__ ((mangled_name(softmax_div_float))) __kernel void softmax_div (const int num, const int dim, __global float* scale, __global float* data); +template __attribute__ ((mangled_name(softmax_div_double))) __kernel void softmax_div (const int num, const int dim, __global double* 
scale, __global double* data); + +template +__kernel void kernel_channel_max(const int num, const int channels, + const int spatial_dim, __global const T* data, __global T* out) { + int index = get_global_id(0); + if(index < num * spatial_dim) { + int n = index / spatial_dim; + int s = index % spatial_dim; + T maxval = -FLT_MAX; + for (int c = 0; c < channels; ++c) { + maxval = max(data[(n * channels + c) * spatial_dim + s], maxval); + } + out[index] = maxval; + } +} + +template __attribute__ ((mangled_name(kernel_channel_max_float))) __kernel void kernel_channel_max(const int num, const int channels, + const int spatial_dim, __global const float* data, __global float* out); +template __attribute__ ((mangled_name(kernel_channel_max_double))) __kernel void kernel_channel_max(const int num, const int channels, + const int spatial_dim, __global const double* data, __global double* out); + +template +__kernel void kernel_channel_subtract(const int count, + const int num, const int channels, + const int spatial_dim, __global const T* channel_max, __global T* data) { + int index = get_global_id(0); + if(index < count) { + int n = index / channels / spatial_dim; + int s = index % spatial_dim; + data[index] -= channel_max[n * spatial_dim + s]; + } +} +template __attribute__ ((mangled_name(kernel_channel_subtract_float))) __kernel void kernel_channel_subtract(const int count, const int num, const int channels, const int spatial_dim, __global const float* channel_max, __global float* data); +template __attribute__ ((mangled_name(kernel_channel_subtract_double))) __kernel void kernel_channel_subtract(const int count, const int num, const int channels, const int spatial_dim, __global const double* channel_max, __global double* data); + +template +__kernel void kernel_channel_sum(const int num, const int channels, + const int spatial_dim, __global const T* data, __global T* channel_sum) { + int index = get_global_id(0); + if(index < num * spatial_dim) { + int n = index / spatial_dim; + int s = index % spatial_dim; + T sum = 0; + for (int c = 0; c < channels; ++c) { + sum += data[(n * channels + c) * spatial_dim + s]; + } + channel_sum[index] = sum; + } +} + +template __attribute__ ((mangled_name(kernel_channel_sum_float))) __kernel void kernel_channel_sum(const int num, const int channels, + const int spatial_dim, __global const float* data, __global float* channel_sum); +template __attribute__ ((mangled_name(kernel_channel_sum_double))) __kernel void kernel_channel_sum(const int num, const int channels, + const int spatial_dim, __global const double* data, __global double* channel_sum); + +template +__kernel void kernel_channel_div(const int count, + const int num, const int channels, + const int spatial_dim, __global const T* channel_sum, __global T* data) { + int index = get_global_id(0); + if(index < count) { + int n = index / channels / spatial_dim; + int s = index % spatial_dim; + data[index] /= channel_sum[n * spatial_dim + s]; + } +} + +template __attribute__ ((mangled_name(kernel_channel_div_float))) __kernel void kernel_channel_div(const int count, + const int num, const int channels, + const int spatial_dim, __global const float* channel_sum, __global float* data); +template __attribute__ ((mangled_name(kernel_channel_div_double))) __kernel void kernel_channel_div(const int count, + const int num, const int channels, + const int spatial_dim, __global const double* channel_sum, __global double* data); + +template +__kernel void kernel_channel_dot(const int num, const int channels, + const int 
spatial_dim, __global const T* data_1, __global const T* data_2, + __global T* channel_dot) { + int index = get_global_id(0); + if(index < num * spatial_dim) { + int n = index / spatial_dim; + int s = index % spatial_dim; + T dot = 0; + for (int c = 0; c < channels; ++c) { + dot += (data_1[(n * channels + c) * spatial_dim + s] + * data_2[(n * channels + c) * spatial_dim + s]); + } + channel_dot[index] = dot; + } +} + +template __attribute__ ((mangled_name(kernel_channel_dot_float))) __kernel void kernel_channel_dot(const int num, const int channels, + const int spatial_dim, __global const float* data_1, __global const float* data_2, + __global float* channel_dot); +template __attribute__ ((mangled_name(kernel_channel_dot_double))) __kernel void kernel_channel_dot(const int num, const int channels, + const int spatial_dim, __global const double* data_1, __global const double* data_2, + __global double* channel_dot); diff --git a/src/caffe/ocl/softmaxwithloss_layer.cl b/src/caffe/ocl/softmaxwithloss_layer.cl new file mode 100644 index 00000000..731f660c --- /dev/null +++ b/src/caffe/ocl/softmaxwithloss_layer.cl @@ -0,0 +1,103 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ **************************************************************************************/ + +template +__kernel void SoftmaxLossForwardGPU(const int nthreads, + __global T* prob_data, __global T* label,__global T* loss, + int num, int dim, int spatial_dim, + bool has_ignore_label_, int ignore_label_, + __global T* counts) { + int index = get_global_id(0); + if(index < nthreads) { + const int n = index / spatial_dim; + const int s = index % spatial_dim; + const int label_value = static_cast(label[n * spatial_dim + s]); + if (has_ignore_label_ && label_value == ignore_label_) { + loss[index] = 0; + counts[index] = 0; + } else { + loss[index] = -log(max(prob_data[n * dim + label_value * spatial_dim + s], + T(FLT_MIN))); + counts[index] = 1; + } + } +} + +template __attribute__ ((mangled_name(SoftmaxLossForwardGPU_float))) __kernel void SoftmaxLossForwardGPU(int nthreads, + __global float* prob_data, __global float* label,__global float* loss, + int num, int dim, int spatial_dim, + bool has_ignore_label_, int ignore_label_, + __global float* counts); +template __attribute__ ((mangled_name(SoftmaxLossForwardGPU_double))) __kernel void SoftmaxLossForwardGPU(int nthreads, + __global double* prob_data, __global double* label,__global double* loss, + int num, int dim, int spatial_dim, + bool has_ignore_label_, int ignore_label_, + __global double* counts); + +template +__kernel void SoftmaxLossBackwardGPU(int nthreads, __global T* top, + __global T* label,__global T* bottom_diff, int num, int dim, + int spatial_dim, bool has_ignore_label_, + int ignore_label_, T* counts) { + const int channels = dim / spatial_dim; + int index = get_global_id(0); + if(index < nthreads) { + const int n = index / spatial_dim; + const int s = index % spatial_dim; + const int label_value = static_cast(label[n * spatial_dim + s]); + + if (has_ignore_label_ && label_value == ignore_label_) { + for (int c = 0; c < channels; ++c) { + bottom_diff[n * dim + c * spatial_dim + s] = 0; + } + counts[index] = 0; + } else { + bottom_diff[n * dim + label_value * spatial_dim + s] -= 1; + counts[index] = 1; + } + } +} +template __attribute__ ((mangled_name(SoftmaxLossBackwardGPU_float))) __kernel void SoftmaxLossBackwardGPU(int nthreads, __global float* top, + __global float* label,__global float* bottom_diff, int num, int dim, + int spatial_dim, bool has_ignore_label_, + int ignore_label_, float* counts); + +template __attribute__ ((mangled_name(SoftmaxLossBackwardGPU_double))) __kernel void SoftmaxLossBackwardGPU(int nthreads, __global double* top, + __global double* label,__global double* bottom_diff, int num, int dim, + int spatial_dim, bool has_ignore_label_, + int ignore_label_, double* counts); + +template +__kernel void scal (const int num, const T alpha, __global T* data) { + int index = get_global_id(0); + int total = get_global_size(0); + for(index; index < num; index += total) { + data[index] = data[index] * alpha; + } +} + +template __attribute__ ((mangled_name(scal_float))) __kernel void scal (const int num, const float alpha, __global float* data); +template __attribute__ ((mangled_name(scal_double))) __kernel void scal (const int num, const double alpha, __global double* data); diff --git a/src/caffe/ocl/tanh_layer.cl b/src/caffe/ocl/tanh_layer.cl new file mode 100644 index 00000000..900f11ea --- /dev/null +++ b/src/caffe/ocl/tanh_layer.cl @@ -0,0 +1,46 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + +template +__kernel void TanHForward(const int count, __global T* in, __global T* out) { + int index = get_global_id(0); + if(index < count) + out[index] = tanh(in[index]); +} + +template __attribute__ ((mangled_name(TanHForward_float))) __kernel void TanHForward(const int count, __global float* in, __global float* out); +template __attribute__ ((mangled_name(TanHForward_double))) __kernel void TanHForward(const int count, __global double* in, __global double* out); + +template +__kernel void TanHBackward(const int count, __global T* in_diff, __global T* out_data,__global T* out_diff) { + int index = get_global_id(0); + if(index < count) { + const T tanhx = out_data[index]; + out_diff[index] = in_diff[index] * (1 - tanhx * tanhx); + } +} + +template __attribute__ ((mangled_name(TanHBackward_float))) __kernel void TanHBackward(const int count, __global float* in_diff, __global float* out_data, __global float* out_diff); +template __attribute__ ((mangled_name(TanHBackward_double))) __kernel void TanHBackward(const int count, __global double* in_diff, __global double* out_data, __global double* out_diff); diff --git a/src/caffe/ocl/threshold_layer.cl b/src/caffe/ocl/threshold_layer.cl new file mode 100644 index 00000000..679dbf29 --- /dev/null +++ b/src/caffe/ocl/threshold_layer.cl @@ -0,0 +1,36 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + +template +__kernel void ThresholdForward(const int count, const T threshold, __global T* in, __global T* out) { + int index = get_global_id(0); + if(index < count) + out[index] =in[index] > threshold ? 1 : 0; +} + +template __attribute__ ((mangled_name(ThresholdForward_float))) __kernel void ThresholdForward(const int count, const float threshold, __global float* in, __global float* out); +template __attribute__ ((mangled_name(ThresholdForward_double))) __kernel void ThresholdForward(const int count, const double threshold, __global double* in, __global double* out); + diff --git a/src/caffe/ocl/util.cl b/src/caffe/ocl/util.cl new file mode 100644 index 00000000..222e4ed9 --- /dev/null +++ b/src/caffe/ocl/util.cl @@ -0,0 +1,268 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ **************************************************************************************/ + +#pragma OPENCL EXTENSION cl_amd_printf : enable + +template +__kernel void OCL_memset(__global T* buffer, const T value, const int size, const int buf_offset) { + int gdx = get_global_id(0); + buffer += buf_offset; + if(gdx < size) { + buffer[gdx] = value; + } +} + +template __attribute__((mangled_name(oclmem_int))) __kernel void OCL_memset(__global int* buffer, const int value, const int size, const int buf_offset); +template __attribute__((mangled_name(oclmem_float))) __kernel void OCL_memset(__global float* buffer, const float value, const int size, const int buf_offset); +template __attribute__((mangled_name(oclmem_double))) __kernel void OCL_memset(__global double* buffer, const double value, const int size, const int buf_offset); + +__kernel void OCL_memset2(__global int* buffer, const int value, const int size) { + int gdx = get_global_id(0); + if(gdx < size) { + buffer[gdx] = value; + } +} + +template +__kernel void caffe_gpu_sign(const int N, __global T* X, __global T* Y) { + int gdx = get_global_id(0); + if(gdx < N) { + Y[gdx] =((X[gdx]>0.0)-(X[gdx]<0.0)); + } +} + +template __attribute__((mangled_name(caffe_gpu_sign_float))) __kernel void caffe_gpu_sign(const int N, __global float* X, __global float* Y); +template __attribute__((mangled_name(caffe_gpu_sign_double))) __kernel void caffe_gpu_sign(const int N, __global double* X, __global double* Y); + +template +__kernel void caffe_gpu_sgnbit(const int N, __global T* X, __global T* Y) { + int gdx = get_global_id(0); + if(gdx < N) { + Y[gdx] =(X[gdx] < 0.0); + } +} + +template __attribute__((mangled_name(caffe_gpu_sgnbit_float))) __kernel void caffe_gpu_sgnbit(const int N, __global float* X, __global float* Y); +template __attribute__((mangled_name(caffe_gpu_sgnbit_double))) __kernel void caffe_gpu_sgnbit(const int N, __global double* X, __global double* Y); + +template +__kernel void caffe_gpu_sign_with_offset(const int N, __global T* X, const int offx, __global T* Y, const int offy) { + X += offx; + Y += offy; + int gdx = get_global_id(0); + if(gdx < N) { + Y[gdx] =((X[gdx]>0.0)-(X[gdx]<0.0)); + } +} +template __attribute__((mangled_name(caffe_gpu_sign_with_offset_float))) __kernel void caffe_gpu_sign_with_offset(const int N, __global float* X, const int offx, __global float* Y, const int offy); +template __attribute__((mangled_name(caffe_gpu_sign_with_offset_double))) __kernel void caffe_gpu_sign_with_offset(const int N, __global double* X, const int offx, __global double* Y, const int offy); + +template +__kernel void caffe_gpu_abs(const int n, __global T* a, __global T* y) { + int index = get_global_id(0); + if(index < n) { + y[index] = fabs(a[index]); + } +} +template __attribute__((mangled_name(caffe_gpu_abs_float))) __kernel void caffe_gpu_abs(const int n, __global float* a, __global float* Y); +template __attribute__((mangled_name(caffe_gpu_abs_double))) __kernel void caffe_gpu_abs(const int n, __global double* a, __global double* Y); + +template +__kernel void get_max(const int num, const int dim, __global T* data, __global T* out) { + int index = get_global_id(0); + if (index < num) { + T maxval = -FLT_MAX; + for (int i = 0; i < dim; i++) + maxval = max( data[index*dim + i], maxval ); + out[index] = maxval; + } +} + +template __attribute__ ((mangled_name(get_max_float))) __kernel void get_max(const int num, const int dim, __global float* data, __global float* out); +template __attribute__ ((mangled_name(get_max_double))) __kernel 
void get_max(const int num, const int dim, __global double* data, __global double* out); + +template +__kernel void exp (const int num, __global T* data, __global T* out) { + int index = get_global_id(0); + if (index < num) + out[index] = exp(data[index]); +} + +template __attribute__ ((mangled_name(exp_float))) __kernel void exp (const int num, __global float* data, __global float* out); +template __attribute__ ((mangled_name(exp_double))) __kernel void exp (const int num, __global double* data, __global double* out); + +template +__kernel void kernel_sub(const int count, __global const T* a, __global const T* b, __global T* out) { + int index = get_global_id(0); + if(index < count) { + out[index] = a[index] - b[index]; + } +} + +template __attribute__ ((mangled_name(kernel_sub_float))) __kernel void kernel_sub(const int count, __global const float* a, __global const float* b, __global float* out); +template __attribute__ ((mangled_name(kernel_sub_double))) __kernel void kernel_sub(const int count, __global const double* a, __global const double* b, __global double* out); + +template +__kernel void kernel_add(const int count, __global const T* a, __global const T* b, __global T* out) { + int index = get_global_id(0); + if(index < count) { + out[index] = a[index] + b[index]; + } +} + +template __attribute__ ((mangled_name(kernel_add_float))) __kernel void kernel_add(const int count, __global const float* a, __global const float* b, __global float* out); +template __attribute__ ((mangled_name(kernel_add_double))) __kernel void kernel_add(const int count, __global const double* a, __global const double* b, __global double* out); + +template +__kernel void kernel_div(const int count, __global const T* a, __global const T* b, __global T* out) { + int index = get_global_id(0); + if(index < count) { + out[index] = a[index] / b[index]; + } +} + +template __attribute__ ((mangled_name(kernel_div_float))) __kernel void kernel_div(const int count, __global const float* a, __global const float* b, __global float* out); +template __attribute__ ((mangled_name(kernel_div_double))) __kernel void kernel_div(const int count, __global const double* a, __global const double* b, __global double* out); + +template +__kernel void kernel_mul(const int count, __global const T* a, __global const T* b, __global T* out) { + int index = get_global_id(0); + if(index < count) { + out[index] = a[index] * b[index]; + } +} + +template __attribute__ ((mangled_name(kernel_mul_float))) __kernel void kernel_mul(const int count, __global const float* a, __global const float* b, __global float* out); +template __attribute__ ((mangled_name(kernel_mul_double))) __kernel void kernel_mul(const int count, __global const double* a, __global const double* b, __global double* out); + +template +__kernel void kernel_powx(const int count, __global const T* data, const T alpha, __global T* out) { + int index = get_global_id(0); + if(index < count) { + out[index] = pow(data[index], alpha); + } +} + +template __attribute__ ((mangled_name(kernel_powx_float))) __kernel void kernel_powx(const int count, __global const float* data, const float alpha, __global float* out); +template __attribute__ ((mangled_name(kernel_powx_double))) __kernel void kernel_powx(const int count, __global const double* data, const double alpha, __global double* out); + +template +__kernel void kernel_exp(const int count, __global const T* data, __global T* out) { + int index = get_global_id(0); + if(index < count) { + out[index] = exp(data[index]); + } +} + +template 
__attribute__ ((mangled_name(kernel_exp_float))) __kernel void kernel_exp(const int count, __global const float* data, __global float* out); +template __attribute__ ((mangled_name(kernel_exp_double))) __kernel void kernel_exp(const int count, __global const double* data, __global double* out); + +template +__kernel void kernel_add_scalar(const int count, const T data, __global T* out) { + int index = get_global_id(0); + if(index < count) { + out[index] = out[index] + data; + } +} + +template __attribute__ ((mangled_name(kernel_add_scalar_float))) __kernel void kernel_add_scalar(const int count, const float data, __global float* out); +template __attribute__ ((mangled_name(kernel_add_scalar_double))) __kernel void kernel_add_scalar(const int count, const double data, __global double* out); + +template +__kernel void kernel_log(const int count, __global const T* data, __global T* out) { + int index = get_global_id(0); + if(index < count) { + out[index] = log(data[index]); + } +} + +template __attribute__ ((mangled_name(kernel_log_float))) __kernel void kernel_log(const int count, __global const float* data, __global float* out); +template __attribute__ ((mangled_name(kernel_log_double))) __kernel void kernel_log(const int count, __global const double* data, __global double* out); + +template +__kernel void diff (const int num, const int dim, __global T* data, __global T* label) { + int index = get_global_id(0); + int total = get_global_size(0); + int offset; + for(index; index < num; index += total) { + offset = (int) label[index]; + data[index * dim + offset] -= 1; + } +} + +template __attribute__ ((mangled_name(diff_float))) __kernel void diff (const int num, const int dim, __global float* data, __global float* label); +template __attribute__ ((mangled_name(diff_double))) __kernel void diff (const int num, const int dim, __global double* data, __global double* label); + +template +__kernel void div (const int n, __global const T* a, __global const T* b, __global T* y) { + int index = get_global_id(0); + if (index < n) + y[index] = a[index] / b[index]; +} + +template __attribute__ ((mangled_name(div_float))) __kernel void div (const int n, __global const float* a, __global const float* b, __global float* y); +//template __attribute__ ((mangled_name(div_double))) __kernel void div (const int n, __global const double* a, __global const double* b, __global double* y); + +template +__kernel void add_scalar (const int n, const T alpha, __global T* y) { + int index = get_global_id(0); + if (index < n) + y[index] += alpha; +} + +template __attribute__ ((mangled_name(add_scalar_float))) __kernel void add_scalar (const int n, const float alpha, __global float* y); +template __attribute__ ((mangled_name(add_scalar_double))) __kernel void add_scalar (const int n, const double alpha, __global double* y); + +template +__kernel void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y) { + int index = get_global_id(0); + if (index < n) + y[index] = in1[index] + in2[index]; +} +template __attribute__ ((mangled_name(caffe_gpu_add_float))) __kernel void caffe_gpu_add(const int n, const float* in1, const float* in2, float* y); +template __attribute__ ((mangled_name(caffe_gpu_add_double))) __kernel void caffe_gpu_add(const int n, const double* in1, const double* in2, double* y); + +template +__kernel void element_mul (const int n, __global const T* a, __global const T* b, __global T* y) { + int index = get_global_id(0); + if (index < n) + y[index] = a[index] * b[index]; +} + +template 
__attribute__ ((mangled_name(element_mul_float))) __kernel void element_mul (const int n, __global const float* a, __global const float* b, __global float* y); +template __attribute__ ((mangled_name(element_mul_double))) __kernel void element_mul (const int n,__global const double* a, __global const double* b, __global double* y); + +template +__kernel void powx (const int n, __global const T* a, const T alpha, __global T* y) { + int index = get_global_id(0); + if (index < n) +// y[index] = a[index] + alpha; + y[index] = pow(a[index], alpha); +} + +template __attribute__ ((mangled_name(powx_float))) __kernel void powx (const int n, __global const float* a, const float alpha, __global float* y); +template __attribute__ ((mangled_name(powx_double))) __kernel void powx (const int n, __global const double* a, const double alpha, __global double* y); + diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index aabe0ede..b9ed1050 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -10,7 +10,7 @@ #include "caffe/util/io.hpp" #include "caffe/util/math_functions.hpp" #include "caffe/util/upgrade_proto.hpp" - +#include "caffe/util/ocl_wrapper.hpp" namespace caffe { template @@ -30,9 +30,10 @@ Solver::Solver(const string& param_file) template void Solver::Init(const SolverParameter& param) { LOG(INFO) << "Initializing solver from parameters: " << std::endl - << param.DebugString(); + << param.DebugString(); param_ = param; CHECK_GE(param_.average_loss(), 1) << "average_loss should be non-negative."; + if (param_.random_seed() >= 0) { Caffe::set_random_seed(param_.random_seed()); } @@ -46,8 +47,8 @@ void Solver::Init(const SolverParameter& param) { template void Solver::InitTrainNet() { - const int num_train_nets = param_.has_net() + param_.has_net_param() + - param_.has_train_net() + param_.has_train_net_param(); + const int num_train_nets = param_.has_net() + param_.has_net_param() + + param_.has_train_net() + param_.has_train_net_param(); const string& field_names = "net, net_param, train_net, train_net_param"; CHECK_GE(num_train_nets, 1) << "SolverParameter must specify a train net " << "using one of these fields: " << field_names; @@ -59,7 +60,7 @@ void Solver::InitTrainNet() { net_param.CopyFrom(param_.train_net_param()); } else if (param_.has_train_net()) { LOG(INFO) << "Creating training net from train_net file: " - << param_.train_net(); + << param_.train_net(); ReadNetParamsFromTextFileOrDie(param_.train_net(), &net_param); } if (param_.has_net_param()) { @@ -93,11 +94,11 @@ void Solver::InitTestNets() { const int num_test_net_files = param_.test_net_size(); const int num_test_nets = num_test_net_params + num_test_net_files; if (num_generic_nets) { - CHECK_GE(param_.test_iter_size(), num_test_nets) - << "test_iter must be specified for each test network."; + CHECK_GE(param_.test_iter_size(), num_test_nets) + << "test_iter must be specified for each test network."; } else { - CHECK_EQ(param_.test_iter_size(), num_test_nets) - << "test_iter must be specified for each test network."; + CHECK_EQ(param_.test_iter_size(), num_test_nets) + << "test_iter must be specified for each test network."; } // If we have a generic net (specified by net or net_param, rather than // test_net or test_net_param), we may have an unlimited number of actual @@ -114,16 +115,16 @@ void Solver::InitTestNets() { CHECK_GT(param_.test_interval(), 0); } int test_net_id = 0; - vector sources(num_test_net_instances); - vector net_params(num_test_net_instances); + vector < string > 
sources(num_test_net_instances); + vector < NetParameter > net_params(num_test_net_instances); for (int i = 0; i < num_test_net_params; ++i, ++test_net_id) { - sources[test_net_id] = "test_net_param"; - net_params[test_net_id].CopyFrom(param_.test_net_param(i)); + sources[test_net_id] = "test_net_param"; + net_params[test_net_id].CopyFrom(param_.test_net_param(i)); } for (int i = 0; i < num_test_net_files; ++i, ++test_net_id) { - sources[test_net_id] = "test_net file: " + param_.test_net(i); - ReadNetParamsFromTextFileOrDie(param_.test_net(i), - &net_params[test_net_id]); + sources[test_net_id] = "test_net file: " + param_.test_net(i); + ReadNetParamsFromTextFileOrDie(param_.test_net(i), + &net_params[test_net_id]); } const int remaining_test_nets = param_.test_iter_size() - test_net_id; if (has_net_param) { @@ -151,8 +152,7 @@ void Solver::InitTestNets() { net_state.MergeFrom(param_.test_state(i)); } net_params[i].mutable_state()->CopyFrom(net_state); - LOG(INFO) - << "Creating test net (#" << i << ") specified by " << sources[i]; + LOG(INFO) << "Creating test net (#" << i << ") specified by " << sources[i]; test_nets_[i].reset(new Net(net_params[i])); test_nets_[i]->set_debug_info(param_.debug_info()); } @@ -164,13 +164,13 @@ void Solver::Step(int iters) { const int start_iter = iter_; const int stop_iter = iter_ + iters; int average_loss = this->param_.average_loss(); - vector losses; + vector < Dtype > losses; Dtype smoothed_loss = 0; while (iter_ < stop_iter) { // zero-init the params for (int i = 0; i < net_->params().size(); ++i) { - shared_ptr > blob = net_->params()[i]; + shared_ptr < Blob > blob = net_->params()[i]; switch (Caffe::mode()) { case Caffe::CPU: caffe_set(blob->count(), static_cast(0), @@ -182,6 +182,13 @@ void Solver::Step(int iters) { blob->mutable_gpu_diff()); #else NO_GPU; +#endif + case Caffe::APU: +#ifndef CPU_ONLY + caffe_gpu_set(blob->count(), static_cast(0), + blob->mutable_gpu_diff()); +#else + NO_GPU; #endif break; } @@ -223,12 +230,11 @@ void Solver::Step(int iters) { for (int k = 0; k < result[j]->count(); ++k) { ostringstream loss_msg_stream; if (loss_weight) { - loss_msg_stream << " (* " << loss_weight - << " = " << loss_weight * result_vec[k] << " loss)"; + loss_msg_stream << " (* " << loss_weight << " = " + << loss_weight * result_vec[k] << " loss)"; } - LOG(INFO) << " Train net output #" - << score_index++ << ": " << output_name << " = " - << result_vec[k] << loss_msg_stream.str(); + LOG(INFO) << " Train net output #" << score_index++ << ": " + << output_name << " = " << result_vec[k] << loss_msg_stream.str(); } } } @@ -281,7 +287,6 @@ void Solver::Solve(const char* resume_file) { LOG(INFO) << "Optimization Done."; } - template void Solver::TestAll() { for (int test_net_id = 0; test_net_id < test_nets_.size(); ++test_net_id) { @@ -291,19 +296,19 @@ void Solver::TestAll() { template void Solver::Test(const int test_net_id) { - LOG(INFO) << "Iteration " << iter_ - << ", Testing net (#" << test_net_id << ")"; - CHECK_NOTNULL(test_nets_[test_net_id].get())-> - ShareTrainedLayersWith(net_.get()); - vector test_score; + LOG(INFO) << "Iteration " << iter_ << ", Testing net (#" << test_net_id + << ")"; + CHECK_NOTNULL(test_nets_[test_net_id].get())->ShareTrainedLayersWith( + net_.get()); + vector < Dtype > test_score; vector test_score_output_id; vector*> bottom_vec; const shared_ptr >& test_net = test_nets_[test_net_id]; Dtype loss = 0; for (int i = 0; i < param_.test_iter(test_net_id); ++i) { Dtype iter_loss; - const vector*>& result = - 
test_net->Forward(bottom_vec, &iter_loss); + const vector*>& result = test_net->Forward(bottom_vec, + &iter_loss); if (param_.test_compute_loss()) { loss += iter_loss; } @@ -337,15 +342,14 @@ void Solver::Test(const int test_net_id) { ostringstream loss_msg_stream; const Dtype mean_score = test_score[i] / param_.test_iter(test_net_id); if (loss_weight) { - loss_msg_stream << " (* " << loss_weight - << " = " << loss_weight * mean_score << " loss)"; + loss_msg_stream << " (* " << loss_weight << " = " + << loss_weight * mean_score << " loss)"; } LOG(INFO) << " Test net output #" << i << ": " << output_name << " = " << mean_score << loss_msg_stream.str(); } } - template void Solver::Snapshot() { NetParameter net_param; @@ -384,7 +388,6 @@ void Solver::Restore(const char* state_file) { RestoreSolverState(state); } - // Return the current learning rate. The currently implemented learning rate // policies are as follows: // - fixed: always return base_lr. @@ -408,31 +411,36 @@ Dtype SGDSolver::GetLearningRate() { rate = this->param_.base_lr(); } else if (lr_policy == "step") { this->current_step_ = this->iter_ / this->param_.stepsize(); - rate = this->param_.base_lr() * - pow(this->param_.gamma(), this->current_step_); + rate = this->param_.base_lr() + * pow(this->param_.gamma(), this->current_step_); } else if (lr_policy == "exp") { rate = this->param_.base_lr() * pow(this->param_.gamma(), this->iter_); } else if (lr_policy == "inv") { - rate = this->param_.base_lr() * - pow(Dtype(1) + this->param_.gamma() * this->iter_, - - this->param_.power()); + rate = this->param_.base_lr() + * pow(Dtype(1) + this->param_.gamma() * this->iter_, + -this->param_.power()); } else if (lr_policy == "multistep") { - if (this->current_step_ < this->param_.stepvalue_size() && - this->iter_ >= this->param_.stepvalue(this->current_step_)) { + if (this->current_step_ < this->param_.stepvalue_size() + && this->iter_ >= this->param_.stepvalue(this->current_step_)) { this->current_step_++; - LOG(INFO) << "MultiStep Status: Iteration " << - this->iter_ << ", step = " << this->current_step_; + LOG(INFO) << "MultiStep Status: Iteration " << this->iter_ << ", step = " + << this->current_step_; } - rate = this->param_.base_lr() * - pow(this->param_.gamma(), this->current_step_); + rate = this->param_.base_lr() + * pow(this->param_.gamma(), this->current_step_); } else if (lr_policy == "poly") { - rate = this->param_.base_lr() * pow(Dtype(1.) - - (Dtype(this->iter_) / Dtype(this->param_.max_iter())), - this->param_.power()); + rate = this->param_.base_lr() + * pow(Dtype(1.) - (Dtype(this->iter_) / Dtype(this->param_.max_iter())), + this->param_.power()); } else if (lr_policy == "sigmoid") { - rate = this->param_.base_lr() * (Dtype(1.) / - (Dtype(1.) + exp(-this->param_.gamma() * (Dtype(this->iter_) - - Dtype(this->param_.stepsize()))))); + rate = + this->param_.base_lr() + * (Dtype(1.) + / (Dtype(1.) 
+        + exp(
+            -this->param_.gamma()
+                * (Dtype(this->iter_)
+                    - Dtype(this->param_.stepsize())))));
   } else {
     LOG(FATAL) << "Unknown learning rate policy: " << lr_policy;
   }
@@ -448,16 +456,18 @@ void SGDSolver<Dtype>::PreSolve() {
   temp_.clear();
   for (int i = 0; i < net_params.size(); ++i) {
     const vector<int>& shape = net_params[i]->shape();
-    history_.push_back(shared_ptr<Blob<Dtype> >(new Blob<Dtype>(shape)));
-    update_.push_back(shared_ptr<Blob<Dtype> >(new Blob<Dtype>(shape)));
-    temp_.push_back(shared_ptr<Blob<Dtype> >(new Blob<Dtype>(shape)));
+    history_.push_back(shared_ptr < Blob<Dtype> > (new Blob<Dtype>(shape)));
+    update_.push_back(shared_ptr < Blob<Dtype> > (new Blob<Dtype>(shape)));
+    temp_.push_back(shared_ptr < Blob<Dtype> > (new Blob<Dtype>(shape)));
   }
 }
 
 template <typename Dtype>
 void SGDSolver<Dtype>::ClipGradients() {
   const Dtype clip_gradients = this->param_.clip_gradients();
-  if (clip_gradients < 0) { return; }
+  if (clip_gradients < 0) {
+    return;
+  }
   const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
   Dtype sumsq_diff = 0;
   for (int i = 0; i < net_params.size(); ++i) {
@@ -469,8 +479,8 @@ void SGDSolver<Dtype>::ClipGradients() {
   if (l2norm_diff > clip_gradients) {
     Dtype scale_factor = clip_gradients / l2norm_diff;
     LOG(INFO) << "Gradient clipping: scaling down gradients (L2 norm "
-        << l2norm_diff << " > " << clip_gradients << ") "
-        << "by scale factor " << scale_factor;
+        << l2norm_diff << " > " << clip_gradients << ") " << "by scale factor "
+        << scale_factor;
     for (int i = 0; i < net_params.size(); ++i) {
       if (this->net_->param_owners()[i] < 0) {
         net_params[i]->scale_diff(scale_factor);
@@ -496,7 +506,9 @@ void SGDSolver<Dtype>::ApplyUpdate() {
 
 template <typename Dtype>
 void SGDSolver<Dtype>::Normalize(int param_id) {
-  if (this->param_.iter_size() == 1) { return; }
+  if (this->param_.iter_size() == 1) {
+    return;
+  }
   // Scale gradient to counterbalance accumulation.
   const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
   const Dtype accum_normalization = Dtype(1.) / this->param_.iter_size();
@@ -528,21 +540,20 @@ void SGDSolver<Dtype>::Regularize(int param_id) {
   Dtype weight_decay = this->param_.weight_decay();
   string regularization_type = this->param_.regularization_type();
   Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
+
   switch (Caffe::mode()) {
   case Caffe::CPU: {
     if (local_decay) {
       if (regularization_type == "L2") {
         // add weight decay
-        caffe_axpy(net_params[param_id]->count(),
-            local_decay,
+        caffe_axpy(net_params[param_id]->count(), local_decay,
             net_params[param_id]->cpu_data(),
             net_params[param_id]->mutable_cpu_diff());
       } else if (regularization_type == "L1") {
         caffe_cpu_sign(net_params[param_id]->count(),
             net_params[param_id]->cpu_data(),
             temp_[param_id]->mutable_cpu_data());
-        caffe_axpy(net_params[param_id]->count(),
-            local_decay,
+        caffe_axpy(net_params[param_id]->count(), local_decay,
             temp_[param_id]->cpu_data(),
             net_params[param_id]->mutable_cpu_diff());
       } else {
@@ -556,16 +567,14 @@ void SGDSolver<Dtype>::Regularize(int param_id) {
     if (local_decay) {
       if (regularization_type == "L2") {
         // add weight decay
-        caffe_gpu_axpy(net_params[param_id]->count(),
-            local_decay,
+        caffe_gpu_axpy(net_params[param_id]->count(), local_decay,
             net_params[param_id]->gpu_data(),
             net_params[param_id]->mutable_gpu_diff());
       } else if (regularization_type == "L1") {
         caffe_gpu_sign(net_params[param_id]->count(),
            net_params[param_id]->gpu_data(),
            temp_[param_id]->mutable_gpu_data());
-        caffe_gpu_axpy(net_params[param_id]->count(),
-            local_decay,
+        caffe_gpu_axpy(net_params[param_id]->count(), local_decay,
            temp_[param_id]->gpu_data(),
            net_params[param_id]->mutable_gpu_diff());
       } else {
@@ -592,19 +601,18 @@ void SGDSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
   switch (Caffe::mode()) {
   case Caffe::CPU: {
     caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
-        net_params[param_id]->cpu_diff(), momentum,
-        history_[param_id]->mutable_cpu_data());
-    caffe_copy(net_params[param_id]->count(),
-        history_[param_id]->cpu_data(),
+        net_params[param_id]->cpu_diff(), momentum,
+        history_[param_id]->mutable_cpu_data());
+    caffe_copy(net_params[param_id]->count(), history_[param_id]->cpu_data(),
         net_params[param_id]->mutable_cpu_diff());
     break;
   }
   case Caffe::GPU: {
 #ifndef CPU_ONLY
     caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
-        net_params[param_id]->gpu_diff(), momentum,
-        history_[param_id]->mutable_gpu_data());
-    caffe_copy(net_params[param_id]->count(),
+        net_params[param_id]->gpu_diff(), momentum,
+        history_[param_id]->mutable_gpu_data());
+    caffe_gpu_copy(net_params[param_id]->count(),
         history_[param_id]->gpu_data(),
         net_params[param_id]->mutable_gpu_diff());
 #else
@@ -652,8 +660,8 @@ void NesterovSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
     // update history
     caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
-        net_params[param_id]->cpu_diff(), momentum,
-        this->history_[param_id]->mutable_cpu_data());
+        net_params[param_id]->cpu_diff(), momentum,
+        this->history_[param_id]->mutable_cpu_data());
     // compute update: step back then over step
     caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) + momentum,
@@ -669,14 +677,14 @@ void NesterovSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
   case Caffe::GPU: {
 #ifndef CPU_ONLY
     // save history momentum for stepping back
-    caffe_copy(net_params[param_id]->count(),
+    caffe_gpu_copy(net_params[param_id]->count(),
         this->history_[param_id]->gpu_data(),
         this->update_[param_id]->mutable_gpu_data());
     // update history
     caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
-        net_params[param_id]->gpu_diff(), momentum,
-        this->history_[param_id]->mutable_gpu_data());
+        net_params[param_id]->gpu_diff(), momentum,
+        this->history_[param_id]->mutable_gpu_data());
     // compute update: step back then over step
     caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) + momentum,
@@ -684,7 +692,7 @@ void NesterovSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
         this->update_[param_id]->mutable_gpu_data());
     // copy
-    caffe_copy(net_params[param_id]->count(),
+    caffe_gpu_copy(net_params[param_id]->count(),
         this->update_[param_id]->gpu_data(),
         net_params[param_id]->mutable_gpu_diff());
 #else
@@ -706,9 +714,8 @@ void AdaGradSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
   switch (Caffe::mode()) {
   case Caffe::CPU: {
     // compute square of gradient in update
-    caffe_powx(net_params[param_id]->count(),
-        net_params[param_id]->cpu_diff(), Dtype(2),
-        this->update_[param_id]->mutable_cpu_data());
+    caffe_powx(net_params[param_id]->count(), net_params[param_id]->cpu_diff(),
+        Dtype(2), this->update_[param_id]->mutable_cpu_data());
     // update history
     caffe_add(net_params[param_id]->count(),
@@ -718,16 +725,15 @@ void AdaGradSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
     // prepare update
     caffe_powx(net_params[param_id]->count(),
-        this->history_[param_id]->cpu_data(), Dtype(0.5),
-        this->update_[param_id]->mutable_cpu_data());
+        this->history_[param_id]->cpu_data(), Dtype(0.5),
+        this->update_[param_id]->mutable_cpu_data());
 
-    caffe_add_scalar(net_params[param_id]->count(),
-        delta, this->update_[param_id]->mutable_cpu_data());
+    caffe_add_scalar(net_params[param_id]->count(), delta,
+        this->update_[param_id]->mutable_cpu_data());
 
-    caffe_div(net_params[param_id]->count(),
-        net_params[param_id]->cpu_diff(),
-        this->update_[param_id]->cpu_data(),
-        this->update_[param_id]->mutable_cpu_data());
+    caffe_div(net_params[param_id]->count(), net_params[param_id]->cpu_diff(),
+        this->update_[param_id]->cpu_data(),
+        this->update_[param_id]->mutable_cpu_data());
     // scale and copy
     caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
@@ -750,16 +756,15 @@ void AdaGradSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
     // prepare update
     caffe_gpu_powx(net_params[param_id]->count(),
-        this->history_[param_id]->gpu_data(), Dtype(0.5),
-        this->update_[param_id]->mutable_gpu_data());
+        this->history_[param_id]->gpu_data(), Dtype(0.5),
+        this->update_[param_id]->mutable_gpu_data());
 
-    caffe_gpu_add_scalar(net_params[param_id]->count(),
-        delta, this->update_[param_id]->mutable_gpu_data());
+    caffe_gpu_add_scalar < Dtype
+        > (net_params[param_id]->count(), delta, this->update_[param_id]->mutable_gpu_data());
 
     caffe_gpu_div(net_params[param_id]->count(),
-        net_params[param_id]->gpu_diff(),
-        this->update_[param_id]->gpu_data(),
-        this->update_[param_id]->mutable_gpu_data());
+        net_params[param_id]->gpu_diff(), this->update_[param_id]->gpu_data(),
+        this->update_[param_id]->mutable_gpu_data());
     // scale and copy
     caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
@@ -775,9 +780,9 @@ void AdaGradSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
   }
 }
 
-INSTANTIATE_CLASS(Solver);
-INSTANTIATE_CLASS(SGDSolver);
-INSTANTIATE_CLASS(NesterovSolver);
-INSTANTIATE_CLASS(AdaGradSolver);
+INSTANTIATE_CLASS (Solver);
+INSTANTIATE_CLASS (SGDSolver);
+INSTANTIATE_CLASS (NesterovSolver);
+INSTANTIATE_CLASS (AdaGradSolver);
 
 }  // namespace caffe
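Note on the GPU branches above: upstream caffe_copy resolves to memcpy (CPU mode) or cudaMemcpy (GPU mode) on raw pointers, but in this OpenCL port gpu_data()/gpu_diff() hand back handles to cl_mem buffers rather than raw device addresses, so the solvers' device-side copies are routed through the new caffe_gpu_copy instead. The port's actual helper is not part of this hunk; the sketch below is only an illustration of how such a copy could be expressed with clEnqueueCopyBuffer, reusing the amdDevice queue that the syncedmem.cpp changes further down rely on. The names, signature, and blocking clFinish here are assumptions, not code from the patch.

// Illustrative sketch only (not the port's caffe_gpu_copy): a device-side copy
// between two buffers whose "pointers" are really cl_mem handles, using the
// global amdDevice command queue that appears elsewhere in this patch.
template <typename Dtype>
void gpu_copy_sketch(const int n, const Dtype* src, Dtype* dst) {
  OCL_CHECK(clEnqueueCopyBuffer(amdDevice.CommandQueue,
      (cl_mem) src, (cl_mem) dst,    // opaque buffer handles, not raw addresses
      0, 0, n * sizeof(Dtype),       // byte offsets and byte count
      0, NULL, NULL));
  clFinish(amdDevice.CommandQueue);  // keep the synchronous semantics of the
                                     // cudaMemcpy path this replaces
}

A throughput-oriented implementation might drop the clFinish and chain the copy through OpenCL events instead; the blocking form simply mirrors the behaviour the solver code expects.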
diff --git a/src/caffe/syncedmem.cpp b/src/caffe/syncedmem.cpp
index 7617ccfb..76d3f2ea 100644
--- a/src/caffe/syncedmem.cpp
+++ b/src/caffe/syncedmem.cpp
@@ -1,43 +1,104 @@
+/*************************************************************************************
+ * Copyright (c) 2015, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation and/or
+ * other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ **************************************************************************************/
+
 #include <cstring>
 
 #include "caffe/common.hpp"
 #include "caffe/syncedmem.hpp"
 #include "caffe/util/math_functions.hpp"
+#include "caffe/util/ocl_util.hpp"
+
+#define CL_MEM_USE_PERSISTENT_MEM_AMD (1 << 6)//specific for AMD devices
 
 namespace caffe {
 
 SyncedMemory::~SyncedMemory() {
+#ifndef CPU_ONLY
   if (cpu_ptr_ && own_cpu_data_) {
-    CaffeFreeHost(cpu_ptr_);
+    OCL_CHECK(
+        clEnqueueUnmapMemObject(amdDevice.CommandQueue, (cl_mem) gpu_cache_ptr_,
+            cpu_ptr_, 0, NULL, NULL));
+    clFinish(amdDevice.CommandQueue);
+  }
+  if (gpu_cache_ptr_ && own_cpu_data_) {
+    OCL_CHECK(clReleaseMemObject((cl_mem) gpu_cache_ptr_));
   }
-
-#ifndef CPU_ONLY
   if (gpu_ptr_) {
-    CUDA_CHECK(cudaFree(gpu_ptr_));
+    OCL_CHECK(clReleaseMemObject((cl_mem) gpu_ptr_));
   }
-#endif  // CPU_ONLY
+
+  clReleaseKernel (oclmem_kernel);
+#endif
+}
+
+//begin: code written/modified by AMD.
+#ifndef CPU_ONLY
+void SyncedMemory::ocl_setup() {
+  cl_int err = 0;
+  oclmem_kernel = clCreateKernel(amdDevice.Program, "OCL_memset2", &err);
+  OCL_CHECK(err);
 }
+#endif
 
 inline void SyncedMemory::to_cpu() {
   switch (head_) {
   case UNINITIALIZED:
+#ifndef CPU_ONLY
+    gpu_cache_ptr_ = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR,
+        size_, NULL, NULL);
+    cpu_ptr_ = clEnqueueMapBuffer(amdDevice.CommandQueue,
+        (cl_mem) gpu_cache_ptr_, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size_,
+        0, NULL, NULL, NULL);
+#else
     CaffeMallocHost(&cpu_ptr_, size_);
-    caffe_memset(size_, 0, cpu_ptr_);
+#endif
+    memset(cpu_ptr_, 0, size_);
     head_ = HEAD_AT_CPU;
     own_cpu_data_ = true;
     break;
-  case HEAD_AT_GPU:
+  case HEAD_AT_GPU: {
 #ifndef CPU_ONLY
     if (cpu_ptr_ == NULL) {
-      CaffeMallocHost(&cpu_ptr_, size_);
+      gpu_cache_ptr_ = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR,
+          size_, NULL, NULL);
+      cpu_ptr_ = clEnqueueMapBuffer(amdDevice.CommandQueue,
+          (cl_mem) gpu_cache_ptr_, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0,
+          size_, 0, NULL, NULL, NULL);
       own_cpu_data_ = true;
     }
-    caffe_gpu_memcpy(size_, gpu_ptr_, cpu_ptr_);
+    OCL_CHECK(
+        clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) gpu_ptr_,
+            (cl_mem) gpu_cache_ptr_, 0, 0, size_, 0, NULL, NULL));
+    clFinish(amdDevice.CommandQueue);
     head_ = SYNCED;
 #else
     NO_GPU;
 #endif
     break;
+  }
   case HEAD_AT_CPU:
   case SYNCED:
     break;
@@ -47,18 +108,34 @@ inline void SyncedMemory::to_cpu() {
 inline void SyncedMemory::to_gpu() {
 #ifndef CPU_ONLY
   switch (head_) {
-  case UNINITIALIZED:
-    CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
-    caffe_gpu_memset(size_, 0, gpu_ptr_);
+  case UNINITIALIZED: {
+    cl_mem tmpMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, size_,
+        NULL, NULL);
+    if (NULL == tmpMem) {
+      fprintf(stderr, "Failed to create memory object\n");
+      break;
+    }
+    ocl_memset(tmpMem, (int) 0, (int) (size_ / sizeof(int)));
+    gpu_ptr_ = (void*) tmpMem;
     head_ = HEAD_AT_GPU;
     break;
-  case HEAD_AT_CPU:
+  }
+  case HEAD_AT_CPU: {
     if (gpu_ptr_ == NULL) {
-      CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
+      cl_mem tmpMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
+          size_, NULL, NULL);
+      if (NULL == tmpMem) {
+        fprintf(stderr, "Failed to create memory object\n");
+      }
+      gpu_ptr_ = (void*) tmpMem;
    }
-    caffe_gpu_memcpy(size_, cpu_ptr_, gpu_ptr_);
+    OCL_CHECK(
+        clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) gpu_cache_ptr_,
+            (cl_mem) gpu_ptr_, 0, 0, size_, 0, NULL, NULL));
+    clFinish(amdDevice.CommandQueue);
     head_ = SYNCED;
     break;
+  }
   case HEAD_AT_GPU:
   case SYNCED:
     break;
@@ -70,13 +147,13 @@ inline void SyncedMemory::to_gpu() {
 
 const void* SyncedMemory::cpu_data() {
   to_cpu();
-  return (const void*)cpu_ptr_;
+  return (const void*) cpu_ptr_;
 }
 
 void SyncedMemory::set_cpu_data(void* data) {
   CHECK(data);
   if (own_cpu_data_) {
-    CaffeFreeHost(cpu_ptr_);
+    CaffeFreeHost (cpu_ptr_);
   }
   cpu_ptr_ = data;
   head_ = HEAD_AT_CPU;
@@ -86,7 +163,7 @@ void SyncedMemory::set_cpu_data(void* data) {
 const void* SyncedMemory::gpu_data() {
 #ifndef CPU_ONLY
   to_gpu();
-  return (const void*)gpu_ptr_;
+  return (const void*) gpu_ptr_;
 #else
   NO_GPU;
 #endif
@@ -108,6 +185,9 @@ void* SyncedMemory::mutable_gpu_data() {
 #endif
 }
 
+const void *SyncedMemory::gpu_cache_data() {
+  return 0;
+}
 
 }  // namespace caffe
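The SyncedMemory rewrite above replaces cudaMalloc/cudaMemcpy with a two-buffer scheme: gpu_cache_ptr_ is a CL_MEM_ALLOC_HOST_PTR buffer that is mapped once with clEnqueueMapBuffer, so cpu_ptr_ points at pinned, device-visible host memory; gpu_ptr_ stays an ordinary CL_MEM_READ_WRITE buffer that kernels operate on; to_cpu()/to_gpu() keep the two in sync with clEnqueueCopyBuffer, and the destructor unmaps before releasing. The standalone sketch below reproduces that allocate/map/copy/teardown sequence with placeholder ctx and queue handles (the port itself goes through the global amdDevice object); it is an illustration under those assumptions, not code from the patch.

// Minimal, self-contained sketch of the map/copy pattern used above.
// "ctx" and "queue" are assumed to be a valid cl_context / cl_command_queue;
// error handling is reduced to a single errcode variable for brevity.
#include <CL/cl.h>
#include <cstring>

void pinned_buffer_demo(cl_context ctx, cl_command_queue queue, size_t size) {
  cl_int err = CL_SUCCESS;

  // Host-visible staging buffer; the runtime backs it with pinned memory.
  cl_mem cache = clCreateBuffer(ctx, CL_MEM_ALLOC_HOST_PTR, size, NULL, &err);
  // Plain device buffer that kernels read and write.
  cl_mem device = clCreateBuffer(ctx, CL_MEM_READ_WRITE, size, NULL, &err);

  // Map once; the returned pointer plays the role of cpu_ptr_ above.
  void* host = clEnqueueMapBuffer(queue, cache, CL_TRUE,
      CL_MAP_READ | CL_MAP_WRITE, 0, size, 0, NULL, NULL, &err);
  memset(host, 0, size);  // host-side initialization, as in to_cpu()

  // to_gpu(): stage host data into the device buffer.
  clEnqueueCopyBuffer(queue, cache, device, 0, 0, size, 0, NULL, NULL);
  // to_cpu(): bring device results back into the mapped staging buffer.
  clEnqueueCopyBuffer(queue, device, cache, 0, 0, size, 0, NULL, NULL);
  clFinish(queue);

  // Teardown mirrors the destructor: unmap before releasing the buffers.
  clEnqueueUnmapMemObject(queue, cache, host, 0, NULL, NULL);
  clFinish(queue);
  clReleaseMemObject(cache);
  clReleaseMemObject(device);
  (void) err;  // a real implementation would check err after every call
}

Keeping the staging buffer permanently mapped avoids a map/unmap round trip on every host access, which is the point of the gpu_cache_ptr_ design; the cost is an extra device-side copy whenever the two buffers fall out of sync.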
diff --git a/src/caffe/test/Makefile b/src/caffe/test/Makefile
new file mode 100644
index 00000000..c9e785c7
--- /dev/null
+++ b/src/caffe/test/Makefile
@@ -0,0 +1,1766 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 2.8
+
+# Default target executed when no arguments are given to make.
+default_target: all
+.PHONY : default_target
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Remove some rules from gmake that .SUFFIXES does not remove.
+SUFFIXES =
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E remove -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The program to use to edit the cache.
+CMAKE_EDIT_COMMAND = /usr/bin/ccmake
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe
+
+#=============================================================================
+# Targets provided globally by CMake.
+
+# Special rule for the target edit_cache
+edit_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake cache editor..."
+	/usr/bin/ccmake -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : edit_cache
+
+# Special rule for the target edit_cache
+edit_cache/fast: edit_cache
+.PHONY : edit_cache/fast
+
+# Special rule for the target install
+install: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install
+
+# Special rule for the target install
+install/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install/fast
+
+# Special rule for the target install/local
+install/local: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local
+
+# Special rule for the target install/local
+install/local/fast: install/local
+.PHONY : install/local/fast
+
+# Special rule for the target install/strip
+install/strip: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip
+
+# Special rule for the target install/strip
+install/strip/fast: install/strip
+.PHONY : install/strip/fast
+
+# Special rule for the target list_install_components
+list_install_components:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
+.PHONY : list_install_components
+
+# Special rule for the target list_install_components
+list_install_components/fast: list_install_components
+.PHONY : list_install_components/fast
+
+# Special rule for the target rebuild_cache
+rebuild_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
+ /usr/bin/cmake -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) +.PHONY : rebuild_cache + +# Special rule for the target rebuild_cache +rebuild_cache/fast: rebuild_cache +.PHONY : rebuild_cache/fast + +# The main all target +all: cmake_check_build_system + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -E cmake_progress_start /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/progress.marks + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/caffe/test/all + $(CMAKE_COMMAND) -E cmake_progress_start /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles 0 +.PHONY : all + +# The main clean target +clean: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/caffe/test/clean +.PHONY : clean + +# The main clean target +clean/fast: clean +.PHONY : clean/fast + +# Prepare targets for installation. +preinstall: all + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/caffe/test/preinstall +.PHONY : preinstall + +# Prepare targets for installation. +preinstall/fast: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/caffe/test/preinstall +.PHONY : preinstall/fast + +# clear depends +depend: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1 +.PHONY : depend + +# Convenience name for target. +src/caffe/test/CMakeFiles/runtest.dir/rule: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/caffe/test/CMakeFiles/runtest.dir/rule +.PHONY : src/caffe/test/CMakeFiles/runtest.dir/rule + +# Convenience name for target. +runtest: src/caffe/test/CMakeFiles/runtest.dir/rule +.PHONY : runtest + +# fast build rule for target. +runtest/fast: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/runtest.dir/build.make src/caffe/test/CMakeFiles/runtest.dir/build +.PHONY : runtest/fast + +# Convenience name for target. +src/caffe/test/CMakeFiles/test.testbin.dir/rule: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/caffe/test/CMakeFiles/test.testbin.dir/rule +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/rule + +# Convenience name for target. +test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/rule +.PHONY : test.testbin + +# fast build rule for target. 
+test.testbin/fast: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/build +.PHONY : test.testbin/fast + +test_accuracy_layer.o: test_accuracy_layer.cpp.o +.PHONY : test_accuracy_layer.o + +# target to build an object file +test_accuracy_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o +.PHONY : test_accuracy_layer.cpp.o + +test_accuracy_layer.i: test_accuracy_layer.cpp.i +.PHONY : test_accuracy_layer.i + +# target to preprocess a source file +test_accuracy_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.i +.PHONY : test_accuracy_layer.cpp.i + +test_accuracy_layer.s: test_accuracy_layer.cpp.s +.PHONY : test_accuracy_layer.s + +# target to generate assembly for a file +test_accuracy_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.s +.PHONY : test_accuracy_layer.cpp.s + +test_argmax_layer.o: test_argmax_layer.cpp.o +.PHONY : test_argmax_layer.o + +# target to build an object file +test_argmax_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o +.PHONY : test_argmax_layer.cpp.o + +test_argmax_layer.i: test_argmax_layer.cpp.i +.PHONY : test_argmax_layer.i + +# target to preprocess a source file +test_argmax_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.i +.PHONY : test_argmax_layer.cpp.i + +test_argmax_layer.s: test_argmax_layer.cpp.s +.PHONY : test_argmax_layer.s + +# target to generate assembly for a file +test_argmax_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.s +.PHONY : test_argmax_layer.cpp.s + +test_benchmark.o: test_benchmark.cpp.o +.PHONY : test_benchmark.o + +# target to build an object file +test_benchmark.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o +.PHONY : test_benchmark.cpp.o + +test_benchmark.i: test_benchmark.cpp.i +.PHONY : test_benchmark.i + +# target to preprocess a source file +test_benchmark.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.i +.PHONY : test_benchmark.cpp.i + +test_benchmark.s: test_benchmark.cpp.s +.PHONY : test_benchmark.s + +# target to generate assembly for a file +test_benchmark.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.s +.PHONY : test_benchmark.cpp.s + +test_blob.o: test_blob.cpp.o +.PHONY : test_blob.o + +# target to build an object file 
+test_blob.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o +.PHONY : test_blob.cpp.o + +test_blob.i: test_blob.cpp.i +.PHONY : test_blob.i + +# target to preprocess a source file +test_blob.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.i +.PHONY : test_blob.cpp.i + +test_blob.s: test_blob.cpp.s +.PHONY : test_blob.s + +# target to generate assembly for a file +test_blob.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.s +.PHONY : test_blob.cpp.s + +test_caffe_main.o: test_caffe_main.cpp.o +.PHONY : test_caffe_main.o + +# target to build an object file +test_caffe_main.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o +.PHONY : test_caffe_main.cpp.o + +test_caffe_main.i: test_caffe_main.cpp.i +.PHONY : test_caffe_main.i + +# target to preprocess a source file +test_caffe_main.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.i +.PHONY : test_caffe_main.cpp.i + +test_caffe_main.s: test_caffe_main.cpp.s +.PHONY : test_caffe_main.s + +# target to generate assembly for a file +test_caffe_main.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.s +.PHONY : test_caffe_main.cpp.s + +test_common.o: test_common.cpp.o +.PHONY : test_common.o + +# target to build an object file +test_common.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o +.PHONY : test_common.cpp.o + +test_common.i: test_common.cpp.i +.PHONY : test_common.i + +# target to preprocess a source file +test_common.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.i +.PHONY : test_common.cpp.i + +test_common.s: test_common.cpp.s +.PHONY : test_common.s + +# target to generate assembly for a file +test_common.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.s +.PHONY : test_common.cpp.s + +test_concat_layer.o: test_concat_layer.cpp.o +.PHONY : test_concat_layer.o + +# target to build an object file +test_concat_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o +.PHONY : test_concat_layer.cpp.o + +test_concat_layer.i: test_concat_layer.cpp.i +.PHONY : test_concat_layer.i + +# target to preprocess a source file +test_concat_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make 
src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.i +.PHONY : test_concat_layer.cpp.i + +test_concat_layer.s: test_concat_layer.cpp.s +.PHONY : test_concat_layer.s + +# target to generate assembly for a file +test_concat_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.s +.PHONY : test_concat_layer.cpp.s + +test_contrastive_loss_layer.o: test_contrastive_loss_layer.cpp.o +.PHONY : test_contrastive_loss_layer.o + +# target to build an object file +test_contrastive_loss_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o +.PHONY : test_contrastive_loss_layer.cpp.o + +test_contrastive_loss_layer.i: test_contrastive_loss_layer.cpp.i +.PHONY : test_contrastive_loss_layer.i + +# target to preprocess a source file +test_contrastive_loss_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.i +.PHONY : test_contrastive_loss_layer.cpp.i + +test_contrastive_loss_layer.s: test_contrastive_loss_layer.cpp.s +.PHONY : test_contrastive_loss_layer.s + +# target to generate assembly for a file +test_contrastive_loss_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.s +.PHONY : test_contrastive_loss_layer.cpp.s + +test_convolution_layer.o: test_convolution_layer.cpp.o +.PHONY : test_convolution_layer.o + +# target to build an object file +test_convolution_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o +.PHONY : test_convolution_layer.cpp.o + +test_convolution_layer.i: test_convolution_layer.cpp.i +.PHONY : test_convolution_layer.i + +# target to preprocess a source file +test_convolution_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.i +.PHONY : test_convolution_layer.cpp.i + +test_convolution_layer.s: test_convolution_layer.cpp.s +.PHONY : test_convolution_layer.s + +# target to generate assembly for a file +test_convolution_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.s +.PHONY : test_convolution_layer.cpp.s + +test_data_layer.o: test_data_layer.cpp.o +.PHONY : test_data_layer.o + +# target to build an object file +test_data_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o +.PHONY : test_data_layer.cpp.o + +test_data_layer.i: test_data_layer.cpp.i +.PHONY : test_data_layer.i + +# target to preprocess a source file +test_data_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make 
src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.i +.PHONY : test_data_layer.cpp.i + +test_data_layer.s: test_data_layer.cpp.s +.PHONY : test_data_layer.s + +# target to generate assembly for a file +test_data_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.s +.PHONY : test_data_layer.cpp.s + +test_data_transformer.o: test_data_transformer.cpp.o +.PHONY : test_data_transformer.o + +# target to build an object file +test_data_transformer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o +.PHONY : test_data_transformer.cpp.o + +test_data_transformer.i: test_data_transformer.cpp.i +.PHONY : test_data_transformer.i + +# target to preprocess a source file +test_data_transformer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.i +.PHONY : test_data_transformer.cpp.i + +test_data_transformer.s: test_data_transformer.cpp.s +.PHONY : test_data_transformer.s + +# target to generate assembly for a file +test_data_transformer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.s +.PHONY : test_data_transformer.cpp.s + +test_db.o: test_db.cpp.o +.PHONY : test_db.o + +# target to build an object file +test_db.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o +.PHONY : test_db.cpp.o + +test_db.i: test_db.cpp.i +.PHONY : test_db.i + +# target to preprocess a source file +test_db.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.i +.PHONY : test_db.cpp.i + +test_db.s: test_db.cpp.s +.PHONY : test_db.s + +# target to generate assembly for a file +test_db.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.s +.PHONY : test_db.cpp.s + +test_deconvolution_layer.o: test_deconvolution_layer.cpp.o +.PHONY : test_deconvolution_layer.o + +# target to build an object file +test_deconvolution_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o +.PHONY : test_deconvolution_layer.cpp.o + +test_deconvolution_layer.i: test_deconvolution_layer.cpp.i +.PHONY : test_deconvolution_layer.i + +# target to preprocess a source file +test_deconvolution_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.i +.PHONY : test_deconvolution_layer.cpp.i + +test_deconvolution_layer.s: test_deconvolution_layer.cpp.s +.PHONY : test_deconvolution_layer.s + +# target to generate assembly for a file +test_deconvolution_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe 
&& $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.s +.PHONY : test_deconvolution_layer.cpp.s + +test_dummy_data_layer.o: test_dummy_data_layer.cpp.o +.PHONY : test_dummy_data_layer.o + +# target to build an object file +test_dummy_data_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o +.PHONY : test_dummy_data_layer.cpp.o + +test_dummy_data_layer.i: test_dummy_data_layer.cpp.i +.PHONY : test_dummy_data_layer.i + +# target to preprocess a source file +test_dummy_data_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.i +.PHONY : test_dummy_data_layer.cpp.i + +test_dummy_data_layer.s: test_dummy_data_layer.cpp.s +.PHONY : test_dummy_data_layer.s + +# target to generate assembly for a file +test_dummy_data_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.s +.PHONY : test_dummy_data_layer.cpp.s + +test_eltwise_layer.o: test_eltwise_layer.cpp.o +.PHONY : test_eltwise_layer.o + +# target to build an object file +test_eltwise_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o +.PHONY : test_eltwise_layer.cpp.o + +test_eltwise_layer.i: test_eltwise_layer.cpp.i +.PHONY : test_eltwise_layer.i + +# target to preprocess a source file +test_eltwise_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.i +.PHONY : test_eltwise_layer.cpp.i + +test_eltwise_layer.s: test_eltwise_layer.cpp.s +.PHONY : test_eltwise_layer.s + +# target to generate assembly for a file +test_eltwise_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.s +.PHONY : test_eltwise_layer.cpp.s + +test_euclidean_loss_layer.o: test_euclidean_loss_layer.cpp.o +.PHONY : test_euclidean_loss_layer.o + +# target to build an object file +test_euclidean_loss_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o +.PHONY : test_euclidean_loss_layer.cpp.o + +test_euclidean_loss_layer.i: test_euclidean_loss_layer.cpp.i +.PHONY : test_euclidean_loss_layer.i + +# target to preprocess a source file +test_euclidean_loss_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.i +.PHONY : test_euclidean_loss_layer.cpp.i + +test_euclidean_loss_layer.s: test_euclidean_loss_layer.cpp.s +.PHONY : test_euclidean_loss_layer.s + +# target to generate assembly for a file +test_euclidean_loss_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f 
src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.s +.PHONY : test_euclidean_loss_layer.cpp.s + +test_filler.o: test_filler.cpp.o +.PHONY : test_filler.o + +# target to build an object file +test_filler.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o +.PHONY : test_filler.cpp.o + +test_filler.i: test_filler.cpp.i +.PHONY : test_filler.i + +# target to preprocess a source file +test_filler.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.i +.PHONY : test_filler.cpp.i + +test_filler.s: test_filler.cpp.s +.PHONY : test_filler.s + +# target to generate assembly for a file +test_filler.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.s +.PHONY : test_filler.cpp.s + +test_filter_layer.o: test_filter_layer.cpp.o +.PHONY : test_filter_layer.o + +# target to build an object file +test_filter_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o +.PHONY : test_filter_layer.cpp.o + +test_filter_layer.i: test_filter_layer.cpp.i +.PHONY : test_filter_layer.i + +# target to preprocess a source file +test_filter_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.i +.PHONY : test_filter_layer.cpp.i + +test_filter_layer.s: test_filter_layer.cpp.s +.PHONY : test_filter_layer.s + +# target to generate assembly for a file +test_filter_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.s +.PHONY : test_filter_layer.cpp.s + +test_flatten_layer.o: test_flatten_layer.cpp.o +.PHONY : test_flatten_layer.o + +# target to build an object file +test_flatten_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o +.PHONY : test_flatten_layer.cpp.o + +test_flatten_layer.i: test_flatten_layer.cpp.i +.PHONY : test_flatten_layer.i + +# target to preprocess a source file +test_flatten_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.i +.PHONY : test_flatten_layer.cpp.i + +test_flatten_layer.s: test_flatten_layer.cpp.s +.PHONY : test_flatten_layer.s + +# target to generate assembly for a file +test_flatten_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.s +.PHONY : test_flatten_layer.cpp.s + +test_gradient_based_solver.o: test_gradient_based_solver.cpp.o +.PHONY : test_gradient_based_solver.o + +# target to build an object file +test_gradient_based_solver.cpp.o: + cd 
/home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o +.PHONY : test_gradient_based_solver.cpp.o + +test_gradient_based_solver.i: test_gradient_based_solver.cpp.i +.PHONY : test_gradient_based_solver.i + +# target to preprocess a source file +test_gradient_based_solver.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.i +.PHONY : test_gradient_based_solver.cpp.i + +test_gradient_based_solver.s: test_gradient_based_solver.cpp.s +.PHONY : test_gradient_based_solver.s + +# target to generate assembly for a file +test_gradient_based_solver.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.s +.PHONY : test_gradient_based_solver.cpp.s + +test_hdf5_output_layer.o: test_hdf5_output_layer.cpp.o +.PHONY : test_hdf5_output_layer.o + +# target to build an object file +test_hdf5_output_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o +.PHONY : test_hdf5_output_layer.cpp.o + +test_hdf5_output_layer.i: test_hdf5_output_layer.cpp.i +.PHONY : test_hdf5_output_layer.i + +# target to preprocess a source file +test_hdf5_output_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.i +.PHONY : test_hdf5_output_layer.cpp.i + +test_hdf5_output_layer.s: test_hdf5_output_layer.cpp.s +.PHONY : test_hdf5_output_layer.s + +# target to generate assembly for a file +test_hdf5_output_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.s +.PHONY : test_hdf5_output_layer.cpp.s + +test_hdf5data_layer.o: test_hdf5data_layer.cpp.o +.PHONY : test_hdf5data_layer.o + +# target to build an object file +test_hdf5data_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o +.PHONY : test_hdf5data_layer.cpp.o + +test_hdf5data_layer.i: test_hdf5data_layer.cpp.i +.PHONY : test_hdf5data_layer.i + +# target to preprocess a source file +test_hdf5data_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.i +.PHONY : test_hdf5data_layer.cpp.i + +test_hdf5data_layer.s: test_hdf5data_layer.cpp.s +.PHONY : test_hdf5data_layer.s + +# target to generate assembly for a file +test_hdf5data_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.s +.PHONY : test_hdf5data_layer.cpp.s + +test_hinge_loss_layer.o: test_hinge_loss_layer.cpp.o +.PHONY : test_hinge_loss_layer.o + +# target to build an object file +test_hinge_loss_layer.cpp.o: + cd 
/home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o +.PHONY : test_hinge_loss_layer.cpp.o + +test_hinge_loss_layer.i: test_hinge_loss_layer.cpp.i +.PHONY : test_hinge_loss_layer.i + +# target to preprocess a source file +test_hinge_loss_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.i +.PHONY : test_hinge_loss_layer.cpp.i + +test_hinge_loss_layer.s: test_hinge_loss_layer.cpp.s +.PHONY : test_hinge_loss_layer.s + +# target to generate assembly for a file +test_hinge_loss_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.s +.PHONY : test_hinge_loss_layer.cpp.s + +test_im2col_layer.o: test_im2col_layer.cpp.o +.PHONY : test_im2col_layer.o + +# target to build an object file +test_im2col_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o +.PHONY : test_im2col_layer.cpp.o + +test_im2col_layer.i: test_im2col_layer.cpp.i +.PHONY : test_im2col_layer.i + +# target to preprocess a source file +test_im2col_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.i +.PHONY : test_im2col_layer.cpp.i + +test_im2col_layer.s: test_im2col_layer.cpp.s +.PHONY : test_im2col_layer.s + +# target to generate assembly for a file +test_im2col_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.s +.PHONY : test_im2col_layer.cpp.s + +test_image_data_layer.o: test_image_data_layer.cpp.o +.PHONY : test_image_data_layer.o + +# target to build an object file +test_image_data_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o +.PHONY : test_image_data_layer.cpp.o + +test_image_data_layer.i: test_image_data_layer.cpp.i +.PHONY : test_image_data_layer.i + +# target to preprocess a source file +test_image_data_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.i +.PHONY : test_image_data_layer.cpp.i + +test_image_data_layer.s: test_image_data_layer.cpp.s +.PHONY : test_image_data_layer.s + +# target to generate assembly for a file +test_image_data_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.s +.PHONY : test_image_data_layer.cpp.s + +test_infogain_loss_layer.o: test_infogain_loss_layer.cpp.o +.PHONY : test_infogain_loss_layer.o + +# target to build an object file +test_infogain_loss_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make 
src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o +.PHONY : test_infogain_loss_layer.cpp.o + +test_infogain_loss_layer.i: test_infogain_loss_layer.cpp.i +.PHONY : test_infogain_loss_layer.i + +# target to preprocess a source file +test_infogain_loss_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.i +.PHONY : test_infogain_loss_layer.cpp.i + +test_infogain_loss_layer.s: test_infogain_loss_layer.cpp.s +.PHONY : test_infogain_loss_layer.s + +# target to generate assembly for a file +test_infogain_loss_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.s +.PHONY : test_infogain_loss_layer.cpp.s + +test_inner_product_layer.o: test_inner_product_layer.cpp.o +.PHONY : test_inner_product_layer.o + +# target to build an object file +test_inner_product_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o +.PHONY : test_inner_product_layer.cpp.o + +test_inner_product_layer.i: test_inner_product_layer.cpp.i +.PHONY : test_inner_product_layer.i + +# target to preprocess a source file +test_inner_product_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.i +.PHONY : test_inner_product_layer.cpp.i + +test_inner_product_layer.s: test_inner_product_layer.cpp.s +.PHONY : test_inner_product_layer.s + +# target to generate assembly for a file +test_inner_product_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.s +.PHONY : test_inner_product_layer.cpp.s + +test_internal_thread.o: test_internal_thread.cpp.o +.PHONY : test_internal_thread.o + +# target to build an object file +test_internal_thread.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o +.PHONY : test_internal_thread.cpp.o + +test_internal_thread.i: test_internal_thread.cpp.i +.PHONY : test_internal_thread.i + +# target to preprocess a source file +test_internal_thread.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.i +.PHONY : test_internal_thread.cpp.i + +test_internal_thread.s: test_internal_thread.cpp.s +.PHONY : test_internal_thread.s + +# target to generate assembly for a file +test_internal_thread.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.s +.PHONY : test_internal_thread.cpp.s + +test_io.o: test_io.cpp.o +.PHONY : test_io.o + +# target to build an object file +test_io.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make 
src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o +.PHONY : test_io.cpp.o + +test_io.i: test_io.cpp.i +.PHONY : test_io.i + +# target to preprocess a source file +test_io.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.i +.PHONY : test_io.cpp.i + +test_io.s: test_io.cpp.s +.PHONY : test_io.s + +# target to generate assembly for a file +test_io.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.s +.PHONY : test_io.cpp.s + +test_layer_factory.o: test_layer_factory.cpp.o +.PHONY : test_layer_factory.o + +# target to build an object file +test_layer_factory.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o +.PHONY : test_layer_factory.cpp.o + +test_layer_factory.i: test_layer_factory.cpp.i +.PHONY : test_layer_factory.i + +# target to preprocess a source file +test_layer_factory.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.i +.PHONY : test_layer_factory.cpp.i + +test_layer_factory.s: test_layer_factory.cpp.s +.PHONY : test_layer_factory.s + +# target to generate assembly for a file +test_layer_factory.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.s +.PHONY : test_layer_factory.cpp.s + +test_lrn_layer.o: test_lrn_layer.cpp.o +.PHONY : test_lrn_layer.o + +# target to build an object file +test_lrn_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o +.PHONY : test_lrn_layer.cpp.o + +test_lrn_layer.i: test_lrn_layer.cpp.i +.PHONY : test_lrn_layer.i + +# target to preprocess a source file +test_lrn_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.i +.PHONY : test_lrn_layer.cpp.i + +test_lrn_layer.s: test_lrn_layer.cpp.s +.PHONY : test_lrn_layer.s + +# target to generate assembly for a file +test_lrn_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.s +.PHONY : test_lrn_layer.cpp.s + +test_math_functions.o: test_math_functions.cpp.o +.PHONY : test_math_functions.o + +# target to build an object file +test_math_functions.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o +.PHONY : test_math_functions.cpp.o + +test_math_functions.i: test_math_functions.cpp.i +.PHONY : test_math_functions.i + +# target to preprocess a source file +test_math_functions.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.i +.PHONY : 
test_math_functions.cpp.i + +test_math_functions.s: test_math_functions.cpp.s +.PHONY : test_math_functions.s + +# target to generate assembly for a file +test_math_functions.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.s +.PHONY : test_math_functions.cpp.s + +test_maxpool_dropout_layers.o: test_maxpool_dropout_layers.cpp.o +.PHONY : test_maxpool_dropout_layers.o + +# target to build an object file +test_maxpool_dropout_layers.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o +.PHONY : test_maxpool_dropout_layers.cpp.o + +test_maxpool_dropout_layers.i: test_maxpool_dropout_layers.cpp.i +.PHONY : test_maxpool_dropout_layers.i + +# target to preprocess a source file +test_maxpool_dropout_layers.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.i +.PHONY : test_maxpool_dropout_layers.cpp.i + +test_maxpool_dropout_layers.s: test_maxpool_dropout_layers.cpp.s +.PHONY : test_maxpool_dropout_layers.s + +# target to generate assembly for a file +test_maxpool_dropout_layers.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.s +.PHONY : test_maxpool_dropout_layers.cpp.s + +test_memory_data_layer.o: test_memory_data_layer.cpp.o +.PHONY : test_memory_data_layer.o + +# target to build an object file +test_memory_data_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o +.PHONY : test_memory_data_layer.cpp.o + +test_memory_data_layer.i: test_memory_data_layer.cpp.i +.PHONY : test_memory_data_layer.i + +# target to preprocess a source file +test_memory_data_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.i +.PHONY : test_memory_data_layer.cpp.i + +test_memory_data_layer.s: test_memory_data_layer.cpp.s +.PHONY : test_memory_data_layer.s + +# target to generate assembly for a file +test_memory_data_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.s +.PHONY : test_memory_data_layer.cpp.s + +test_multinomial_logistic_loss_layer.o: test_multinomial_logistic_loss_layer.cpp.o +.PHONY : test_multinomial_logistic_loss_layer.o + +# target to build an object file +test_multinomial_logistic_loss_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o +.PHONY : test_multinomial_logistic_loss_layer.cpp.o + +test_multinomial_logistic_loss_layer.i: test_multinomial_logistic_loss_layer.cpp.i +.PHONY : test_multinomial_logistic_loss_layer.i + +# target to preprocess a source file +test_multinomial_logistic_loss_layer.cpp.i: + cd 
/home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.i +.PHONY : test_multinomial_logistic_loss_layer.cpp.i + +test_multinomial_logistic_loss_layer.s: test_multinomial_logistic_loss_layer.cpp.s +.PHONY : test_multinomial_logistic_loss_layer.s + +# target to generate assembly for a file +test_multinomial_logistic_loss_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.s +.PHONY : test_multinomial_logistic_loss_layer.cpp.s + +test_mvn_layer.o: test_mvn_layer.cpp.o +.PHONY : test_mvn_layer.o + +# target to build an object file +test_mvn_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o +.PHONY : test_mvn_layer.cpp.o + +test_mvn_layer.i: test_mvn_layer.cpp.i +.PHONY : test_mvn_layer.i + +# target to preprocess a source file +test_mvn_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.i +.PHONY : test_mvn_layer.cpp.i + +test_mvn_layer.s: test_mvn_layer.cpp.s +.PHONY : test_mvn_layer.s + +# target to generate assembly for a file +test_mvn_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.s +.PHONY : test_mvn_layer.cpp.s + +test_net.o: test_net.cpp.o +.PHONY : test_net.o + +# target to build an object file +test_net.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o +.PHONY : test_net.cpp.o + +test_net.i: test_net.cpp.i +.PHONY : test_net.i + +# target to preprocess a source file +test_net.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.i +.PHONY : test_net.cpp.i + +test_net.s: test_net.cpp.s +.PHONY : test_net.s + +# target to generate assembly for a file +test_net.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.s +.PHONY : test_net.cpp.s + +test_neuron_layer.o: test_neuron_layer.cpp.o +.PHONY : test_neuron_layer.o + +# target to build an object file +test_neuron_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o +.PHONY : test_neuron_layer.cpp.o + +test_neuron_layer.i: test_neuron_layer.cpp.i +.PHONY : test_neuron_layer.i + +# target to preprocess a source file +test_neuron_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.i +.PHONY : test_neuron_layer.cpp.i + +test_neuron_layer.s: test_neuron_layer.cpp.s +.PHONY : test_neuron_layer.s + +# target to generate assembly for a file 
+test_neuron_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.s +.PHONY : test_neuron_layer.cpp.s + +test_platform.o: test_platform.cpp.o +.PHONY : test_platform.o + +# target to build an object file +test_platform.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o +.PHONY : test_platform.cpp.o + +test_platform.i: test_platform.cpp.i +.PHONY : test_platform.i + +# target to preprocess a source file +test_platform.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.i +.PHONY : test_platform.cpp.i + +test_platform.s: test_platform.cpp.s +.PHONY : test_platform.s + +# target to generate assembly for a file +test_platform.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.s +.PHONY : test_platform.cpp.s + +test_pooling_layer.o: test_pooling_layer.cpp.o +.PHONY : test_pooling_layer.o + +# target to build an object file +test_pooling_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o +.PHONY : test_pooling_layer.cpp.o + +test_pooling_layer.i: test_pooling_layer.cpp.i +.PHONY : test_pooling_layer.i + +# target to preprocess a source file +test_pooling_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.i +.PHONY : test_pooling_layer.cpp.i + +test_pooling_layer.s: test_pooling_layer.cpp.s +.PHONY : test_pooling_layer.s + +# target to generate assembly for a file +test_pooling_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.s +.PHONY : test_pooling_layer.cpp.s + +test_power_layer.o: test_power_layer.cpp.o +.PHONY : test_power_layer.o + +# target to build an object file +test_power_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o +.PHONY : test_power_layer.cpp.o + +test_power_layer.i: test_power_layer.cpp.i +.PHONY : test_power_layer.i + +# target to preprocess a source file +test_power_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.i +.PHONY : test_power_layer.cpp.i + +test_power_layer.s: test_power_layer.cpp.s +.PHONY : test_power_layer.s + +# target to generate assembly for a file +test_power_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.s +.PHONY : test_power_layer.cpp.s + +test_protobuf.o: test_protobuf.cpp.o +.PHONY : test_protobuf.o + +# target to build an object file 
+test_protobuf.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o +.PHONY : test_protobuf.cpp.o + +test_protobuf.i: test_protobuf.cpp.i +.PHONY : test_protobuf.i + +# target to preprocess a source file +test_protobuf.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.i +.PHONY : test_protobuf.cpp.i + +test_protobuf.s: test_protobuf.cpp.s +.PHONY : test_protobuf.s + +# target to generate assembly for a file +test_protobuf.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.s +.PHONY : test_protobuf.cpp.s + +test_random_number_generator.o: test_random_number_generator.cpp.o +.PHONY : test_random_number_generator.o + +# target to build an object file +test_random_number_generator.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o +.PHONY : test_random_number_generator.cpp.o + +test_random_number_generator.i: test_random_number_generator.cpp.i +.PHONY : test_random_number_generator.i + +# target to preprocess a source file +test_random_number_generator.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.i +.PHONY : test_random_number_generator.cpp.i + +test_random_number_generator.s: test_random_number_generator.cpp.s +.PHONY : test_random_number_generator.s + +# target to generate assembly for a file +test_random_number_generator.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.s +.PHONY : test_random_number_generator.cpp.s + +test_reduction_layer.o: test_reduction_layer.cpp.o +.PHONY : test_reduction_layer.o + +# target to build an object file +test_reduction_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o +.PHONY : test_reduction_layer.cpp.o + +test_reduction_layer.i: test_reduction_layer.cpp.i +.PHONY : test_reduction_layer.i + +# target to preprocess a source file +test_reduction_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.i +.PHONY : test_reduction_layer.cpp.i + +test_reduction_layer.s: test_reduction_layer.cpp.s +.PHONY : test_reduction_layer.s + +# target to generate assembly for a file +test_reduction_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.s +.PHONY : test_reduction_layer.cpp.s + +test_reshape_layer.o: test_reshape_layer.cpp.o +.PHONY : test_reshape_layer.o + +# target to build an object file +test_reshape_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f 
src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o +.PHONY : test_reshape_layer.cpp.o + +test_reshape_layer.i: test_reshape_layer.cpp.i +.PHONY : test_reshape_layer.i + +# target to preprocess a source file +test_reshape_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.i +.PHONY : test_reshape_layer.cpp.i + +test_reshape_layer.s: test_reshape_layer.cpp.s +.PHONY : test_reshape_layer.s + +# target to generate assembly for a file +test_reshape_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.s +.PHONY : test_reshape_layer.cpp.s + +test_sigmoid_cross_entropy_loss_layer.o: test_sigmoid_cross_entropy_loss_layer.cpp.o +.PHONY : test_sigmoid_cross_entropy_loss_layer.o + +# target to build an object file +test_sigmoid_cross_entropy_loss_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o +.PHONY : test_sigmoid_cross_entropy_loss_layer.cpp.o + +test_sigmoid_cross_entropy_loss_layer.i: test_sigmoid_cross_entropy_loss_layer.cpp.i +.PHONY : test_sigmoid_cross_entropy_loss_layer.i + +# target to preprocess a source file +test_sigmoid_cross_entropy_loss_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.i +.PHONY : test_sigmoid_cross_entropy_loss_layer.cpp.i + +test_sigmoid_cross_entropy_loss_layer.s: test_sigmoid_cross_entropy_loss_layer.cpp.s +.PHONY : test_sigmoid_cross_entropy_loss_layer.s + +# target to generate assembly for a file +test_sigmoid_cross_entropy_loss_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.s +.PHONY : test_sigmoid_cross_entropy_loss_layer.cpp.s + +test_slice_layer.o: test_slice_layer.cpp.o +.PHONY : test_slice_layer.o + +# target to build an object file +test_slice_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o +.PHONY : test_slice_layer.cpp.o + +test_slice_layer.i: test_slice_layer.cpp.i +.PHONY : test_slice_layer.i + +# target to preprocess a source file +test_slice_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.i +.PHONY : test_slice_layer.cpp.i + +test_slice_layer.s: test_slice_layer.cpp.s +.PHONY : test_slice_layer.s + +# target to generate assembly for a file +test_slice_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.s +.PHONY : test_slice_layer.cpp.s + +test_softmax_layer.o: test_softmax_layer.cpp.o +.PHONY : test_softmax_layer.o + +# target to build an object file 
+test_softmax_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o +.PHONY : test_softmax_layer.cpp.o + +test_softmax_layer.i: test_softmax_layer.cpp.i +.PHONY : test_softmax_layer.i + +# target to preprocess a source file +test_softmax_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.i +.PHONY : test_softmax_layer.cpp.i + +test_softmax_layer.s: test_softmax_layer.cpp.s +.PHONY : test_softmax_layer.s + +# target to generate assembly for a file +test_softmax_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.s +.PHONY : test_softmax_layer.cpp.s + +test_softmax_with_loss_layer.o: test_softmax_with_loss_layer.cpp.o +.PHONY : test_softmax_with_loss_layer.o + +# target to build an object file +test_softmax_with_loss_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o +.PHONY : test_softmax_with_loss_layer.cpp.o + +test_softmax_with_loss_layer.i: test_softmax_with_loss_layer.cpp.i +.PHONY : test_softmax_with_loss_layer.i + +# target to preprocess a source file +test_softmax_with_loss_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.i +.PHONY : test_softmax_with_loss_layer.cpp.i + +test_softmax_with_loss_layer.s: test_softmax_with_loss_layer.cpp.s +.PHONY : test_softmax_with_loss_layer.s + +# target to generate assembly for a file +test_softmax_with_loss_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.s +.PHONY : test_softmax_with_loss_layer.cpp.s + +test_solver.o: test_solver.cpp.o +.PHONY : test_solver.o + +# target to build an object file +test_solver.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o +.PHONY : test_solver.cpp.o + +test_solver.i: test_solver.cpp.i +.PHONY : test_solver.i + +# target to preprocess a source file +test_solver.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.i +.PHONY : test_solver.cpp.i + +test_solver.s: test_solver.cpp.s +.PHONY : test_solver.s + +# target to generate assembly for a file +test_solver.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.s +.PHONY : test_solver.cpp.s + +test_split_layer.o: test_split_layer.cpp.o +.PHONY : test_split_layer.o + +# target to build an object file +test_split_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make 
src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o +.PHONY : test_split_layer.cpp.o + +test_split_layer.i: test_split_layer.cpp.i +.PHONY : test_split_layer.i + +# target to preprocess a source file +test_split_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.i +.PHONY : test_split_layer.cpp.i + +test_split_layer.s: test_split_layer.cpp.s +.PHONY : test_split_layer.s + +# target to generate assembly for a file +test_split_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.s +.PHONY : test_split_layer.cpp.s + +test_spp_layer.o: test_spp_layer.cpp.o +.PHONY : test_spp_layer.o + +# target to build an object file +test_spp_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o +.PHONY : test_spp_layer.cpp.o + +test_spp_layer.i: test_spp_layer.cpp.i +.PHONY : test_spp_layer.i + +# target to preprocess a source file +test_spp_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.i +.PHONY : test_spp_layer.cpp.i + +test_spp_layer.s: test_spp_layer.cpp.s +.PHONY : test_spp_layer.s + +# target to generate assembly for a file +test_spp_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.s +.PHONY : test_spp_layer.cpp.s + +test_stochastic_pooling.o: test_stochastic_pooling.cpp.o +.PHONY : test_stochastic_pooling.o + +# target to build an object file +test_stochastic_pooling.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o +.PHONY : test_stochastic_pooling.cpp.o + +test_stochastic_pooling.i: test_stochastic_pooling.cpp.i +.PHONY : test_stochastic_pooling.i + +# target to preprocess a source file +test_stochastic_pooling.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.i +.PHONY : test_stochastic_pooling.cpp.i + +test_stochastic_pooling.s: test_stochastic_pooling.cpp.s +.PHONY : test_stochastic_pooling.s + +# target to generate assembly for a file +test_stochastic_pooling.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.s +.PHONY : test_stochastic_pooling.cpp.s + +test_syncedmem.o: test_syncedmem.cpp.o +.PHONY : test_syncedmem.o + +# target to build an object file +test_syncedmem.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o +.PHONY : test_syncedmem.cpp.o + +test_syncedmem.i: test_syncedmem.cpp.i +.PHONY : test_syncedmem.i + +# target to preprocess a source file +test_syncedmem.cpp.i: + cd 
/home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.i +.PHONY : test_syncedmem.cpp.i + +test_syncedmem.s: test_syncedmem.cpp.s +.PHONY : test_syncedmem.s + +# target to generate assembly for a file +test_syncedmem.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.s +.PHONY : test_syncedmem.cpp.s + +test_tanh_layer.o: test_tanh_layer.cpp.o +.PHONY : test_tanh_layer.o + +# target to build an object file +test_tanh_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o +.PHONY : test_tanh_layer.cpp.o + +test_tanh_layer.i: test_tanh_layer.cpp.i +.PHONY : test_tanh_layer.i + +# target to preprocess a source file +test_tanh_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.i +.PHONY : test_tanh_layer.cpp.i + +test_tanh_layer.s: test_tanh_layer.cpp.s +.PHONY : test_tanh_layer.s + +# target to generate assembly for a file +test_tanh_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.s +.PHONY : test_tanh_layer.cpp.s + +test_threshold_layer.o: test_threshold_layer.cpp.o +.PHONY : test_threshold_layer.o + +# target to build an object file +test_threshold_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o +.PHONY : test_threshold_layer.cpp.o + +test_threshold_layer.i: test_threshold_layer.cpp.i +.PHONY : test_threshold_layer.i + +# target to preprocess a source file +test_threshold_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.i +.PHONY : test_threshold_layer.cpp.i + +test_threshold_layer.s: test_threshold_layer.cpp.s +.PHONY : test_threshold_layer.s + +# target to generate assembly for a file +test_threshold_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.s +.PHONY : test_threshold_layer.cpp.s + +test_upgrade_proto.o: test_upgrade_proto.cpp.o +.PHONY : test_upgrade_proto.o + +# target to build an object file +test_upgrade_proto.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o +.PHONY : test_upgrade_proto.cpp.o + +test_upgrade_proto.i: test_upgrade_proto.cpp.i +.PHONY : test_upgrade_proto.i + +# target to preprocess a source file +test_upgrade_proto.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.i +.PHONY : test_upgrade_proto.cpp.i + +test_upgrade_proto.s: test_upgrade_proto.cpp.s +.PHONY : 
test_upgrade_proto.s + +# target to generate assembly for a file +test_upgrade_proto.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.s +.PHONY : test_upgrade_proto.cpp.s + +test_util_blas.o: test_util_blas.cpp.o +.PHONY : test_util_blas.o + +# target to build an object file +test_util_blas.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o +.PHONY : test_util_blas.cpp.o + +test_util_blas.i: test_util_blas.cpp.i +.PHONY : test_util_blas.i + +# target to preprocess a source file +test_util_blas.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.i +.PHONY : test_util_blas.cpp.i + +test_util_blas.s: test_util_blas.cpp.s +.PHONY : test_util_blas.s + +# target to generate assembly for a file +test_util_blas.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.s +.PHONY : test_util_blas.cpp.s + +# Help Target +help: + @echo "The following are some of the valid targets for this Makefile:" + @echo "... all (the default if no target is provided)" + @echo "... clean" + @echo "... depend" + @echo "... edit_cache" + @echo "... install" + @echo "... install/local" + @echo "... install/strip" + @echo "... list_install_components" + @echo "... rebuild_cache" + @echo "... runtest" + @echo "... test.testbin" + @echo "... test_accuracy_layer.o" + @echo "... test_accuracy_layer.i" + @echo "... test_accuracy_layer.s" + @echo "... test_argmax_layer.o" + @echo "... test_argmax_layer.i" + @echo "... test_argmax_layer.s" + @echo "... test_benchmark.o" + @echo "... test_benchmark.i" + @echo "... test_benchmark.s" + @echo "... test_blob.o" + @echo "... test_blob.i" + @echo "... test_blob.s" + @echo "... test_caffe_main.o" + @echo "... test_caffe_main.i" + @echo "... test_caffe_main.s" + @echo "... test_common.o" + @echo "... test_common.i" + @echo "... test_common.s" + @echo "... test_concat_layer.o" + @echo "... test_concat_layer.i" + @echo "... test_concat_layer.s" + @echo "... test_contrastive_loss_layer.o" + @echo "... test_contrastive_loss_layer.i" + @echo "... test_contrastive_loss_layer.s" + @echo "... test_convolution_layer.o" + @echo "... test_convolution_layer.i" + @echo "... test_convolution_layer.s" + @echo "... test_data_layer.o" + @echo "... test_data_layer.i" + @echo "... test_data_layer.s" + @echo "... test_data_transformer.o" + @echo "... test_data_transformer.i" + @echo "... test_data_transformer.s" + @echo "... test_db.o" + @echo "... test_db.i" + @echo "... test_db.s" + @echo "... test_deconvolution_layer.o" + @echo "... test_deconvolution_layer.i" + @echo "... test_deconvolution_layer.s" + @echo "... test_dummy_data_layer.o" + @echo "... test_dummy_data_layer.i" + @echo "... test_dummy_data_layer.s" + @echo "... test_eltwise_layer.o" + @echo "... test_eltwise_layer.i" + @echo "... test_eltwise_layer.s" + @echo "... test_euclidean_loss_layer.o" + @echo "... test_euclidean_loss_layer.i" + @echo "... test_euclidean_loss_layer.s" + @echo "... test_filler.o" + @echo "... test_filler.i" + @echo "... test_filler.s" + @echo "... test_filter_layer.o" + @echo "... 
test_filter_layer.i" + @echo "... test_filter_layer.s" + @echo "... test_flatten_layer.o" + @echo "... test_flatten_layer.i" + @echo "... test_flatten_layer.s" + @echo "... test_gradient_based_solver.o" + @echo "... test_gradient_based_solver.i" + @echo "... test_gradient_based_solver.s" + @echo "... test_hdf5_output_layer.o" + @echo "... test_hdf5_output_layer.i" + @echo "... test_hdf5_output_layer.s" + @echo "... test_hdf5data_layer.o" + @echo "... test_hdf5data_layer.i" + @echo "... test_hdf5data_layer.s" + @echo "... test_hinge_loss_layer.o" + @echo "... test_hinge_loss_layer.i" + @echo "... test_hinge_loss_layer.s" + @echo "... test_im2col_layer.o" + @echo "... test_im2col_layer.i" + @echo "... test_im2col_layer.s" + @echo "... test_image_data_layer.o" + @echo "... test_image_data_layer.i" + @echo "... test_image_data_layer.s" + @echo "... test_infogain_loss_layer.o" + @echo "... test_infogain_loss_layer.i" + @echo "... test_infogain_loss_layer.s" + @echo "... test_inner_product_layer.o" + @echo "... test_inner_product_layer.i" + @echo "... test_inner_product_layer.s" + @echo "... test_internal_thread.o" + @echo "... test_internal_thread.i" + @echo "... test_internal_thread.s" + @echo "... test_io.o" + @echo "... test_io.i" + @echo "... test_io.s" + @echo "... test_layer_factory.o" + @echo "... test_layer_factory.i" + @echo "... test_layer_factory.s" + @echo "... test_lrn_layer.o" + @echo "... test_lrn_layer.i" + @echo "... test_lrn_layer.s" + @echo "... test_math_functions.o" + @echo "... test_math_functions.i" + @echo "... test_math_functions.s" + @echo "... test_maxpool_dropout_layers.o" + @echo "... test_maxpool_dropout_layers.i" + @echo "... test_maxpool_dropout_layers.s" + @echo "... test_memory_data_layer.o" + @echo "... test_memory_data_layer.i" + @echo "... test_memory_data_layer.s" + @echo "... test_multinomial_logistic_loss_layer.o" + @echo "... test_multinomial_logistic_loss_layer.i" + @echo "... test_multinomial_logistic_loss_layer.s" + @echo "... test_mvn_layer.o" + @echo "... test_mvn_layer.i" + @echo "... test_mvn_layer.s" + @echo "... test_net.o" + @echo "... test_net.i" + @echo "... test_net.s" + @echo "... test_neuron_layer.o" + @echo "... test_neuron_layer.i" + @echo "... test_neuron_layer.s" + @echo "... test_platform.o" + @echo "... test_platform.i" + @echo "... test_platform.s" + @echo "... test_pooling_layer.o" + @echo "... test_pooling_layer.i" + @echo "... test_pooling_layer.s" + @echo "... test_power_layer.o" + @echo "... test_power_layer.i" + @echo "... test_power_layer.s" + @echo "... test_protobuf.o" + @echo "... test_protobuf.i" + @echo "... test_protobuf.s" + @echo "... test_random_number_generator.o" + @echo "... test_random_number_generator.i" + @echo "... test_random_number_generator.s" + @echo "... test_reduction_layer.o" + @echo "... test_reduction_layer.i" + @echo "... test_reduction_layer.s" + @echo "... test_reshape_layer.o" + @echo "... test_reshape_layer.i" + @echo "... test_reshape_layer.s" + @echo "... test_sigmoid_cross_entropy_loss_layer.o" + @echo "... test_sigmoid_cross_entropy_loss_layer.i" + @echo "... test_sigmoid_cross_entropy_loss_layer.s" + @echo "... test_slice_layer.o" + @echo "... test_slice_layer.i" + @echo "... test_slice_layer.s" + @echo "... test_softmax_layer.o" + @echo "... test_softmax_layer.i" + @echo "... test_softmax_layer.s" + @echo "... test_softmax_with_loss_layer.o" + @echo "... test_softmax_with_loss_layer.i" + @echo "... test_softmax_with_loss_layer.s" + @echo "... test_solver.o" + @echo "... 
test_solver.i" + @echo "... test_solver.s" + @echo "... test_split_layer.o" + @echo "... test_split_layer.i" + @echo "... test_split_layer.s" + @echo "... test_spp_layer.o" + @echo "... test_spp_layer.i" + @echo "... test_spp_layer.s" + @echo "... test_stochastic_pooling.o" + @echo "... test_stochastic_pooling.i" + @echo "... test_stochastic_pooling.s" + @echo "... test_syncedmem.o" + @echo "... test_syncedmem.i" + @echo "... test_syncedmem.s" + @echo "... test_tanh_layer.o" + @echo "... test_tanh_layer.i" + @echo "... test_tanh_layer.s" + @echo "... test_threshold_layer.o" + @echo "... test_threshold_layer.i" + @echo "... test_threshold_layer.s" + @echo "... test_upgrade_proto.o" + @echo "... test_upgrade_proto.i" + @echo "... test_upgrade_proto.s" + @echo "... test_util_blas.o" + @echo "... test_util_blas.i" + @echo "... test_util_blas.s" +.PHONY : help + + + +#============================================================================= +# Special targets to cleanup operation of make. + +# Special rule to run CMake to check the build system integrity. +# No rule that depends on this can have commands that come from listfiles +# because they might be regenerated. +cmake_check_build_system: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0 +.PHONY : cmake_check_build_system + diff --git a/src/caffe/test/cmake_install.cmake b/src/caffe/test/cmake_install.cmake new file mode 100644 index 00000000..fa890cd7 --- /dev/null +++ b/src/caffe/test/cmake_install.cmake @@ -0,0 +1,34 @@ +# Install script for directory: /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test + +# Set the install prefix +IF(NOT DEFINED CMAKE_INSTALL_PREFIX) + SET(CMAKE_INSTALL_PREFIX "/home/yugao/caffe-merge-junli/caffe-yb/caffe/install") +ENDIF(NOT DEFINED CMAKE_INSTALL_PREFIX) +STRING(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}") + +# Set the install configuration name. +IF(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME) + IF(BUILD_TYPE) + STRING(REGEX REPLACE "^[^A-Za-z0-9_]+" "" + CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}") + ELSE(BUILD_TYPE) + SET(CMAKE_INSTALL_CONFIG_NAME "Release") + ENDIF(BUILD_TYPE) + MESSAGE(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"") +ENDIF(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME) + +# Set the component getting installed. +IF(NOT CMAKE_INSTALL_COMPONENT) + IF(COMPONENT) + MESSAGE(STATUS "Install component: \"${COMPONENT}\"") + SET(CMAKE_INSTALL_COMPONENT "${COMPONENT}") + ELSE(COMPONENT) + SET(CMAKE_INSTALL_COMPONENT) + ENDIF(COMPONENT) +ENDIF(NOT CMAKE_INSTALL_COMPONENT) + +# Install shared libraries without execute permission? +IF(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE) + SET(CMAKE_INSTALL_SO_NO_EXE "1") +ENDIF(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE) + diff --git a/src/caffe/test/test_caffe_main.cpp b/src/caffe/test/test_caffe_main.cpp index c8caf5ac..32643b3b 100644 --- a/src/caffe/test/test_caffe_main.cpp +++ b/src/caffe/test/test_caffe_main.cpp @@ -2,38 +2,27 @@ // to allow a main function to be compiled into the binary. 
#include "caffe/caffe.hpp" +#include "caffe/common.hpp" #include "caffe/test/test_caffe_main.hpp" -namespace caffe { -#ifndef CPU_ONLY - cudaDeviceProp CAFFE_TEST_CUDA_PROP; -#endif -} - -#ifndef CPU_ONLY -using caffe::CAFFE_TEST_CUDA_PROP; -#endif int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); caffe::GlobalInit(&argc, &argv); #ifndef CPU_ONLY // Before starting testing, let's first print out a few cuda defice info. - int device; - cudaGetDeviceCount(&device); - cout << "Cuda number of devices: " << device << endl; + int device = 0; if (argc > 1) { // Use the given device device = atoi(argv[1]); - cudaSetDevice(device); + caffe::amdDevice.Init(device); cout << "Setting to use device " << device << endl; - } else if (CUDA_TEST_DEVICE >= 0) { + } else if (OPENCL_TEST_DEVICE >= 0) { // Use the device assigned in build configuration; but with a lower priority - device = CUDA_TEST_DEVICE; + device = OPENCL_TEST_DEVICE; } - cudaGetDevice(&device); cout << "Current device id: " << device << endl; - cudaGetDeviceProperties(&CAFFE_TEST_CUDA_PROP, device); + caffe::amdDevice.Init(); #endif // invoke the test. return RUN_ALL_TESTS(); diff --git a/src/caffe/test/test_common.cpp b/src/caffe/test/test_common.cpp index b3a61b0f..6c80de1d 100644 --- a/src/caffe/test/test_common.cpp +++ b/src/caffe/test/test_common.cpp @@ -14,12 +14,13 @@ class CommonTest : public ::testing::Test {}; #ifndef CPU_ONLY // GPU Caffe singleton test. +/* TEST_F(CommonTest, TestCublasHandlerGPU) { int cuda_device_id; CUDA_CHECK(cudaGetDevice(&cuda_device_id)); EXPECT_TRUE(Caffe::cublas_handle()); } - +*/ #endif TEST_F(CommonTest, TestBrewMode) { @@ -45,7 +46,7 @@ TEST_F(CommonTest, TestRandSeedCPU) { } #ifndef CPU_ONLY // GPU Caffe singleton test. - +/* TEST_F(CommonTest, TestRandSeedGPU) { SyncedMemory data_a(10 * sizeof(unsigned int)); SyncedMemory data_b(10 * sizeof(unsigned int)); @@ -60,7 +61,7 @@ TEST_F(CommonTest, TestRandSeedGPU) { ((const unsigned int*)(data_b.cpu_data()))[i]); } } - +*/ #endif } // namespace caffe diff --git a/src/caffe/test/test_convolution_layer.cpp b/src/caffe/test/test_convolution_layer.cpp index 67d41fff..576095c1 100644 --- a/src/caffe/test/test_convolution_layer.cpp +++ b/src/caffe/test/test_convolution_layer.cpp @@ -122,10 +122,11 @@ class ConvolutionLayerTest : public MultiDeviceTest { } virtual ~ConvolutionLayerTest() { - delete blob_bottom_; + /* delete blob_bottom_; delete blob_bottom_2_; delete blob_top_; delete blob_top_2_; + */ } virtual Blob* MakeReferenceTop(Blob* top) { diff --git a/src/caffe/test/test_data/generate_sample_data.py b/src/caffe/test/test_data/generate_sample_data.py old mode 100644 new mode 100755 diff --git a/src/caffe/test/test_filter_layer.cpp b/src/caffe/test/test_filter_layer.cpp index c641b6ef..801881e9 100644 --- a/src/caffe/test/test_filter_layer.cpp +++ b/src/caffe/test/test_filter_layer.cpp @@ -13,7 +13,7 @@ #include "caffe/test/test_gradient_check_util.hpp" namespace caffe { - +/* template class FilterLayerTest : public MultiDeviceTest { typedef typename TypeParam::Dtype Dtype; @@ -124,5 +124,5 @@ TYPED_TEST(FilterLayerTest, TestGradient) { checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, this->blob_top_vec_, 0); } - +*/ } // namespace caffe diff --git a/src/caffe/test/test_inner_product_layer.cpp b/src/caffe/test/test_inner_product_layer.cpp index c03df173..7913b49c 100644 --- a/src/caffe/test/test_inner_product_layer.cpp +++ b/src/caffe/test/test_inner_product_layer.cpp @@ -13,9 +13,9 @@ namespace caffe { -#ifndef 
CPU_ONLY -extern cudaDeviceProp CAFFE_TEST_CUDA_PROP; -#endif +//#ifndef CPU_ONLY +//extern cudaDeviceProp CAFFE_TEST_CUDA_PROP; +//#endif template class InnerProductLayerTest : public MultiDeviceTest { @@ -57,12 +57,8 @@ TYPED_TEST(InnerProductLayerTest, TestSetUp) { TYPED_TEST(InnerProductLayerTest, TestForward) { typedef typename TypeParam::Dtype Dtype; - bool IS_VALID_CUDA = false; -#ifndef CPU_ONLY - IS_VALID_CUDA = CAFFE_TEST_CUDA_PROP.major >= 2; -#endif if (Caffe::mode() == Caffe::CPU || - sizeof(Dtype) == 4 || IS_VALID_CUDA) { + sizeof(Dtype) == 4 ) { LayerParameter layer_param; InnerProductParameter* inner_product_param = layer_param.mutable_inner_product_param(); @@ -87,12 +83,8 @@ TYPED_TEST(InnerProductLayerTest, TestForward) { TYPED_TEST(InnerProductLayerTest, TestGradient) { typedef typename TypeParam::Dtype Dtype; - bool IS_VALID_CUDA = false; -#ifndef CPU_ONLY - IS_VALID_CUDA = CAFFE_TEST_CUDA_PROP.major >= 2; -#endif if (Caffe::mode() == Caffe::CPU || - sizeof(Dtype) == 4 || IS_VALID_CUDA) { + sizeof(Dtype) == 4 ) { LayerParameter layer_param; InnerProductParameter* inner_product_param = layer_param.mutable_inner_product_param(); diff --git a/src/caffe/test/test_math_functions.cpp b/src/caffe/test/test_math_functions.cpp index a095b544..a0f88065 100644 --- a/src/caffe/test/test_math_functions.cpp +++ b/src/caffe/test/test_math_functions.cpp @@ -232,7 +232,7 @@ TYPED_TEST(GPUMathFunctionsTest, TestCopy) { const int n = this->blob_bottom_->count(); const TypeParam* bottom_data = this->blob_bottom_->gpu_data(); TypeParam* top_data = this->blob_top_->mutable_gpu_data(); - caffe_copy(n, bottom_data, top_data); + caffe_gpu_copy(n, bottom_data, top_data); bottom_data = this->blob_bottom_->cpu_data(); top_data = this->blob_top_->mutable_cpu_data(); for (int i = 0; i < n; ++i) { diff --git a/src/caffe/test/test_platform.cpp b/src/caffe/test/test_platform.cpp index f3513e08..7a30c2db 100644 --- a/src/caffe/test/test_platform.cpp +++ b/src/caffe/test/test_platform.cpp @@ -10,10 +10,10 @@ namespace caffe { -extern cudaDeviceProp CAFFE_TEST_CUDA_PROP; +//extern cudaDeviceProp CAFFE_TEST_CUDA_PROP; class PlatformTest : public ::testing::Test {}; - +/* TEST_F(PlatformTest, TestInitialization) { printf("Major revision number: %d\n", CAFFE_TEST_CUDA_PROP.major); printf("Minor revision number: %d\n", CAFFE_TEST_CUDA_PROP.minor); @@ -51,7 +51,7 @@ TEST_F(PlatformTest, TestInitialization) { (CAFFE_TEST_CUDA_PROP.unifiedAddressing ? 
"Yes" : "No")); EXPECT_TRUE(true); } - +*/ } // namespace caffe #endif // CPU_ONLY diff --git a/src/caffe/test/test_util_blas.cpp b/src/caffe/test/test_util_blas.cpp index 8770f309..9cc9558c 100644 --- a/src/caffe/test/test_util_blas.cpp +++ b/src/caffe/test/test_util_blas.cpp @@ -12,7 +12,7 @@ namespace caffe { -extern cudaDeviceProp CAFFE_TEST_CUDA_PROP; +//extern cudaDeviceProp CAFFE_TEST_CUDA_PROP; template class GemmTest : public ::testing::Test {}; @@ -30,7 +30,8 @@ TYPED_TEST(GemmTest, TestGemmCPUGPU) { caffe_copy(6, data, A.mutable_cpu_data()); caffe_copy(12, data, B.mutable_cpu_data()); - if (sizeof(TypeParam) == 4 || CAFFE_TEST_CUDA_PROP.major >= 2) { + // if (sizeof(TypeParam) == 4 || CAFFE_TEST_CUDA_PROP.major >= 2) { + if (sizeof(TypeParam) == 4) { // [1, 2, 3; 4 5 6] * [1, 2, 3, 4; 5, 6, 7, 8; 9, 10, 11, 12]; caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, 2, 4, 3, 1., A.cpu_data(), B.cpu_data(), 0., C.mutable_cpu_data()); @@ -100,7 +101,8 @@ TYPED_TEST(GemmTest, TestGemvCPUGPU) { caffe_copy(6, data, A.mutable_cpu_data()); caffe_copy(3, data, x.mutable_cpu_data()); - if (sizeof(TypeParam) == 4 || CAFFE_TEST_CUDA_PROP.major >= 2) { + // if (sizeof(TypeParam) == 4 || CAFFE_TEST_CUDA_PROP.major >= 2) { + if (sizeof(TypeParam) == 4) { caffe_cpu_gemv(CblasNoTrans, 2, 3, 1., A.cpu_data(), x.cpu_data(), 0., y.mutable_cpu_data()); for (int i = 0; i < 2; ++i) { diff --git a/src/caffe/util/benchmark.cpp b/src/caffe/util/benchmark.cpp index 1d269c35..2dcf0e5a 100644 --- a/src/caffe/util/benchmark.cpp +++ b/src/caffe/util/benchmark.cpp @@ -6,34 +6,16 @@ namespace caffe { Timer::Timer() - : initted_(false), - running_(false), - has_run_at_least_once_(false) { + : initted_(false), running_(false), has_run_at_least_once_(false) { Init(); } Timer::~Timer() { - if (Caffe::mode() == Caffe::GPU) { -#ifndef CPU_ONLY - CUDA_CHECK(cudaEventDestroy(start_gpu_)); - CUDA_CHECK(cudaEventDestroy(stop_gpu_)); -#else - NO_GPU; -#endif - } } void Timer::Start() { if (!running()) { - if (Caffe::mode() == Caffe::GPU) { -#ifndef CPU_ONLY - CUDA_CHECK(cudaEventRecord(start_gpu_, 0)); -#else - NO_GPU; -#endif - } else { - start_cpu_ = boost::posix_time::microsec_clock::local_time(); - } + start_cpu_ = boost::posix_time::microsec_clock::local_time(); running_ = true; has_run_at_least_once_ = true; } @@ -41,21 +23,11 @@ void Timer::Start() { void Timer::Stop() { if (running()) { - if (Caffe::mode() == Caffe::GPU) { -#ifndef CPU_ONLY - CUDA_CHECK(cudaEventRecord(stop_gpu_, 0)); - CUDA_CHECK(cudaEventSynchronize(stop_gpu_)); -#else - NO_GPU; -#endif - } else { - stop_cpu_ = boost::posix_time::microsec_clock::local_time(); - } + stop_cpu_ = boost::posix_time::microsec_clock::local_time(); running_ = false; } } - float Timer::MicroSeconds() { if (!has_run_at_least_once()) { LOG(WARNING) << "Timer has never been run before reading time."; @@ -64,18 +36,8 @@ float Timer::MicroSeconds() { if (running()) { Stop(); } - if (Caffe::mode() == Caffe::GPU) { -#ifndef CPU_ONLY - CUDA_CHECK(cudaEventElapsedTime(&elapsed_milliseconds_, start_gpu_, - stop_gpu_)); - // Cuda only measure milliseconds - elapsed_microseconds_ = elapsed_milliseconds_ * 1000; -#else - NO_GPU; -#endif - } else { - elapsed_microseconds_ = (stop_cpu_ - start_cpu_).total_microseconds(); - } + + elapsed_microseconds_ = (stop_cpu_ - start_cpu_).total_microseconds(); return elapsed_microseconds_; } @@ -87,16 +49,8 @@ float Timer::MilliSeconds() { if (running()) { Stop(); } - if (Caffe::mode() == Caffe::GPU) { -#ifndef CPU_ONLY - 
CUDA_CHECK(cudaEventElapsedTime(&elapsed_milliseconds_, start_gpu_, - stop_gpu_)); -#else - NO_GPU; -#endif - } else { - elapsed_milliseconds_ = (stop_cpu_ - start_cpu_).total_milliseconds(); - } + + elapsed_milliseconds_ = (stop_cpu_ - start_cpu_).total_milliseconds(); return elapsed_milliseconds_; } @@ -107,12 +61,6 @@ float Timer::Seconds() { void Timer::Init() { if (!initted()) { if (Caffe::mode() == Caffe::GPU) { -#ifndef CPU_ONLY - CUDA_CHECK(cudaEventCreate(&start_gpu_)); - CUDA_CHECK(cudaEventCreate(&stop_gpu_)); -#else - NO_GPU; -#endif } initted_ = true; } @@ -147,8 +95,8 @@ float CPUTimer::MilliSeconds() { if (running()) { Stop(); } - this->elapsed_milliseconds_ = (this->stop_cpu_ - - this->start_cpu_).total_milliseconds(); + this->elapsed_milliseconds_ = + (this->stop_cpu_ - this->start_cpu_).total_milliseconds(); return this->elapsed_milliseconds_; } @@ -160,8 +108,8 @@ float CPUTimer::MicroSeconds() { if (running()) { Stop(); } - this->elapsed_microseconds_ = (this->stop_cpu_ - - this->start_cpu_).total_microseconds(); + this->elapsed_microseconds_ = + (this->stop_cpu_ - this->start_cpu_).total_microseconds(); return this->elapsed_microseconds_; } diff --git a/src/caffe/util/cudnn.cpp b/src/caffe/util/cudnn.cpp index 1772f009..592017c5 100644 --- a/src/caffe/util/cudnn.cpp +++ b/src/caffe/util/cudnn.cpp @@ -2,22 +2,22 @@ #include "caffe/util/cudnn.hpp" namespace caffe { -namespace cudnn { + namespace cudnn { -float dataType::oneval = 1.0; -float dataType::zeroval = 0.0; -const void* dataType::one = + float dataType::oneval = 1.0; + float dataType::zeroval = 0.0; + const void* dataType::one = static_cast(&dataType::oneval); -const void* dataType::zero = + const void* dataType::zero = static_cast(&dataType::zeroval); -double dataType::oneval = 1.0; -double dataType::zeroval = 0.0; -const void* dataType::one = + double dataType::oneval = 1.0; + double dataType::zeroval = 0.0; + const void* dataType::one = static_cast(&dataType::oneval); -const void* dataType::zero = + const void* dataType::zero = static_cast(&dataType::zeroval); -} // namespace cudnn + } // namespace cudnn } // namespace caffe #endif diff --git a/src/caffe/util/db.cpp b/src/caffe/util/db.cpp index f55420e9..fd4de1bf 100644 --- a/src/caffe/util/db.cpp +++ b/src/caffe/util/db.cpp @@ -4,7 +4,8 @@ #include -namespace caffe { namespace db { +namespace caffe { +namespace db { DB* GetDB(DataParameter::DB backend) { switch (backend) { diff --git a/src/caffe/util/db_leveldb.cpp b/src/caffe/util/db_leveldb.cpp index 06c46627..d8eac5f7 100644 --- a/src/caffe/util/db_leveldb.cpp +++ b/src/caffe/util/db_leveldb.cpp @@ -2,7 +2,8 @@ #include -namespace caffe { namespace db { +namespace caffe { +namespace db { void LevelDB::Open(const string& source, Mode mode) { leveldb::Options options; @@ -12,8 +13,8 @@ void LevelDB::Open(const string& source, Mode mode) { options.error_if_exists = mode == NEW; options.create_if_missing = mode != READ; leveldb::Status status = leveldb::DB::Open(options, source, &db_); - CHECK(status.ok()) << "Failed to open leveldb " << source - << std::endl << status.ToString(); + CHECK(status.ok()) << "Failed to open leveldb " << source << std::endl + << status.ToString(); LOG(INFO) << "Opened leveldb " << source; } diff --git a/src/caffe/util/db_lmdb.cpp b/src/caffe/util/db_lmdb.cpp index a054b796..126b3790 100644 --- a/src/caffe/util/db_lmdb.cpp +++ b/src/caffe/util/db_lmdb.cpp @@ -4,14 +4,15 @@ #include -namespace caffe { namespace db { +namespace caffe { +namespace db { const size_t LMDB_MAP_SIZE = 
1099511627776; // 1 TB void LMDB::Open(const string& source, Mode mode) { MDB_CHECK(mdb_env_create(&mdb_env_)); - MDB_CHECK(mdb_env_set_mapsize(mdb_env_, LMDB_MAP_SIZE)); - if (mode == NEW) { + MDB_CHECK (mdb_env_set_mapsize(mdb_env_, LMDB_MAP_SIZE));if +( mode == NEW) { CHECK_EQ(mkdir(source.c_str(), 0744), 0) << "mkdir " << source << "failed"; } int flags = 0; diff --git a/src/caffe/util/im2col.cpp b/src/caffe/util/im2col.cpp index c48f31f3..e9c07970 100644 --- a/src/caffe/util/im2col.cpp +++ b/src/caffe/util/im2col.cpp @@ -1,18 +1,45 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ **************************************************************************************/ + #include #include #include +#include "caffe/common.hpp" #include "caffe/util/im2col.hpp" #include "caffe/util/math_functions.hpp" namespace caffe { +template extern std::string get_dtype_suffix(); + template -void im2col_cpu(const Dtype* data_im, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - Dtype* data_col) { +void im2col_cpu(const Dtype* data_im, const int channels, const int height, + const int width, const int kernel_h, const int kernel_w, const int pad_h, + const int pad_w, const int stride_h, const int stride_w, Dtype* data_col) { int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1; int channels_col = channels * kernel_h * kernel_w; @@ -25,8 +52,8 @@ void im2col_cpu(const Dtype* data_im, const int channels, int h_pad = h * stride_h - pad_h + h_offset; int w_pad = w * stride_w - pad_w + w_offset; if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width) - data_col[(c * height_col + h) * width_col + w] = - data_im[(c_im * height + h_pad) * width + w_pad]; + data_col[(c * height_col + h) * width_col + w] = data_im[(c_im + * height + h_pad) * width + w_pad]; else data_col[(c * height_col + h) * width_col + w] = 0; } @@ -34,22 +61,19 @@ void im2col_cpu(const Dtype* data_im, const int channels, } } -// Explicit instantiation template void im2col_cpu(const float* data_im, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, float* data_col); + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + float* data_col); template void im2col_cpu(const double* data_im, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, double* data_col); + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + double* data_col); template -void col2im_cpu(const Dtype* data_col, const int channels, - const int height, const int width, const int patch_h, const int patch_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - Dtype* data_im) { +void col2im_cpu(const Dtype* data_col, const int channels, const int height, + const int width, const int patch_h, const int patch_w, const int pad_h, + const int pad_w, const int stride_h, const int stride_w, Dtype* data_im) { caffe_set(height * width * channels, Dtype(0), data_im); int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1; int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1; @@ -63,21 +87,214 @@ void col2im_cpu(const Dtype* data_col, const int channels, int h_pad = h * stride_h - pad_h + h_offset; int w_pad = w * stride_w - pad_w + w_offset; if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width) - data_im[(c_im * height + h_pad) * width + w_pad] += - data_col[(c * height_col + h) * width_col + w]; + data_im[(c_im * height + h_pad) * width + w_pad] += data_col[(c + * height_col + h) * width_col + w]; } } } } -// Explicit instantiation template void col2im_cpu(const float* data_col, const int channels, const int height, const int width, const int patch_h, const int patch_w, - const int pad_h, const int pad_w, const int stride_h, - 
const int stride_w, float* data_im); + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + float* data_im); template void col2im_cpu(const double* data_col, const int channels, const int height, const int width, const int patch_h, const int patch_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, double* data_im); + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + double* data_im); + +#ifndef CPU_ONLY +template +void col2im_gpu_opt(const Dtype* data_col, const int col_offset, + const int channels, const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, Dtype* data_im, const int img_offset, + int optnum) { + std::string kernel_name = "col2im_opt" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; + int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1; + int num_kernels = channels * height * width * optnum; + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_col); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &col_offset); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &kernel_h); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &kernel_w); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pad_h); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &pad_w); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &height_col); + ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &width_col); + ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*) &data_im); + ret |= clSetKernelArg(Kernel, 15, sizeof(cl_int), (void*) &img_offset); + ret |= clSetKernelArg(Kernel, 16, sizeof(cl_int), (void*) &optnum); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) num_kernels }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} + +template void col2im_gpu_opt(const float* data_col, const int col_offset, + const int channels, const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + float* data_im, const int img_offset, + int optnum); +template void col2im_gpu_opt(const double* data_col, + const int col_offset, const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, double* data_im, + const int img_offset, int optnum); + +template +void im2col_gpu(const Dtype* data_im, const int img_offset, const int channels, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + Dtype* data_col, const int col_offset) { + std::string kernel_name = "im2col" + get_dtype_suffix(); + cl_kernel Kernel 
= amdDevice.GetKernel(kernel_name); + + int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; + int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1; + int num_kernels = channels * height_col * width_col; + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_im); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &img_offset); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &kernel_h); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &kernel_w); + + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pad_h); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pad_w); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &stride_h); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_w); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &height_col); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &width_col); + ret |= clSetKernelArg(Kernel, 13, sizeof(cl_mem), (void*) &data_col); + ret |= clSetKernelArg(Kernel, 14, sizeof(cl_int), (void*) &col_offset); + + size_t uiGlobal_Work_Size[] = { (size_t) num_kernels }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); + +} + +template void im2col_gpu(const float* data_im, const int img_offset, + const int channels, const int height, const int width, const int kernel_h, + const int kernel_w, const int pad_h, const int pad_w, const int stride_h, + const int stride_w, float* data_col, const int col_offset); +template void im2col_gpu(const double* data_im, const int img_offset, + const int channels, const int height, const int width, const int kernel_h, + const int kernel_w, const int pad_h, const int pad_w, const int stride_h, + const int stride_w, double* data_col, const int col_offset); + +template +void col2im_gpu(const Dtype* data_col, const int col_offset, const int channels, const int height, + const int width, const int patch_h, const int patch_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + Dtype* data_im, const int img_offset) { + std::string kernel_name = "col2im" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1; + int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1; + int num_kernels = channels * height * width; + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_col); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &col_offset); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &patch_h); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &patch_w); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pad_h); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &pad_w); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) 
&stride_w); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &height_col); + ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &width_col); + ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*) &data_im); + ret |= clSetKernelArg(Kernel, 15, sizeof(cl_int), (void*) &img_offset); + + size_t uiGlobal_Work_Size[] = { (size_t) num_kernels }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} + +template void col2im_gpu(const float* data_col, const int col_offset, + const int channels, const int height, const int width, const int patch_h, + const int patch_w, const int pad_h, const int pad_w, const int stride_h, + const int stride_w, float* data_im, const int img_offset); +template void col2im_gpu(const double* data_col, const int col_offset, + const int channels, const int height, const int width, const int patch_h, + const int patch_w, const int pad_h, const int pad_w, const int stride_h, + const int stride_w, double* data_im, const int img_offset); + +template +void im2col_gpu_opt(const Dtype* data_im, const int img_offset, + const int channels, const int height, const int width,const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, Dtype* data_col, const int col_offset, + int optnum) { + + std::string kernel_name = "im2col_opt" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; + int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1; + int num_kernels = optnum * channels * height_col * width_col; + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_im); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &img_offset); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &kernel_h); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &kernel_w); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pad_h); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &pad_w); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &height_col); + ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &width_col); + ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*) &data_col); + ret |= clSetKernelArg(Kernel, 15, sizeof(cl_int), (void*) &col_offset); + ret |= clSetKernelArg(Kernel, 16, sizeof(cl_int), (void*) &optnum); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) num_kernels }; + size_t uiLocal_Work_Size[] = { (size_t)(256 - 256 % width_col) }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} + +template void im2col_gpu_opt(const float* data_im, const int img_offset, + const int channels, const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, float* data_col, const int col_offset, + int optnum); 
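
Aside (editor's illustration, not part of the patch): the OpenCL im2col/col2im paths above all follow the same host-side launch pattern — compute the output geometry, bind each kernel argument by index with clSetKernelArg, then enqueue a 1-D NDRange with one work-item per output element and a fixed work-group size of 256. The sketch below shows that pattern in isolation for a hypothetical, simplified kernel signature (no img_offset/col_offset arguments); "queue" and "kernel" are assumed to have been created elsewhere, and the rounding of the global size is an OpenCL requirement noted here, not something the patch itself does.

#include <CL/cl.h>
#include <cstdio>

static void launch_im2col_like(cl_command_queue queue, cl_kernel kernel,
                               cl_mem data_im, cl_mem data_col,
                               int channels, int height, int width,
                               int kernel_h, int kernel_w,
                               int pad_h, int pad_w,
                               int stride_h, int stride_w) {
  // Output spatial size, computed the same way as in im2col_gpu above.
  int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
  int width_col  = (width  + 2 * pad_w - kernel_w) / stride_w + 1;
  int num_kernels = channels * height_col * width_col;

  cl_int ret = CL_SUCCESS;
  cl_uint idx = 0;
  // Arguments must be bound in the order the kernel declares them.
  ret |= clSetKernelArg(kernel, idx++, sizeof(cl_int), &num_kernels);
  ret |= clSetKernelArg(kernel, idx++, sizeof(cl_mem), &data_im);
  ret |= clSetKernelArg(kernel, idx++, sizeof(cl_int), &height);
  ret |= clSetKernelArg(kernel, idx++, sizeof(cl_int), &width);
  ret |= clSetKernelArg(kernel, idx++, sizeof(cl_int), &kernel_h);
  ret |= clSetKernelArg(kernel, idx++, sizeof(cl_int), &kernel_w);
  ret |= clSetKernelArg(kernel, idx++, sizeof(cl_int), &pad_h);
  ret |= clSetKernelArg(kernel, idx++, sizeof(cl_int), &pad_w);
  ret |= clSetKernelArg(kernel, idx++, sizeof(cl_int), &stride_h);
  ret |= clSetKernelArg(kernel, idx++, sizeof(cl_int), &stride_w);
  ret |= clSetKernelArg(kernel, idx++, sizeof(cl_int), &height_col);
  ret |= clSetKernelArg(kernel, idx++, sizeof(cl_int), &width_col);
  ret |= clSetKernelArg(kernel, idx++, sizeof(cl_mem), &data_col);
  if (ret != CL_SUCCESS) {
    fprintf(stderr, "clSetKernelArg failed: %d\n", ret);
    return;
  }

  // One work-item per (channel, y_out, x_out) element, work-group of 256 as in
  // uiLocal_Work_Size above.  OpenCL 1.x requires the global size to be a
  // multiple of the local size, so this sketch rounds up; the kernel is then
  // expected to bounds-check its global id against num_kernels.
  size_t local = 256;
  size_t global = ((size_t)num_kernels + local - 1) / local * local;
  ret = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local,
                               0, NULL, NULL);
  if (ret != CL_SUCCESS) {
    fprintf(stderr, "clEnqueueNDRangeKernel failed: %d\n", ret);
  }
}
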
+template void im2col_gpu_opt(const double* data_im, + const int img_offset, const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, double* data_col, + const int col_offset, int optnum); +#endif } // namespace caffe diff --git a/src/caffe/util/im2col.cu b/src/caffe/util/im2col.cu index c90f93eb..0848017a 100644 --- a/src/caffe/util/im2col.cu +++ b/src/caffe/util/im2col.cu @@ -32,7 +32,7 @@ __global__ void im2col_gpu_kernel(const int n, const Dtype* data_im, int h = h_in + i; int w = w_in + j; *data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ? - data_im_ptr[i * width + j] : 0; + data_im_ptr[i * width + j] : 0; data_col_ptr += height_col * width_col; } } @@ -40,11 +40,9 @@ __global__ void im2col_gpu_kernel(const int n, const Dtype* data_im, } template -void im2col_gpu(const Dtype* data_im, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - Dtype* data_col) { +void im2col_gpu(const Dtype* data_im, const int channels, const int height, + const int width, const int kernel_h, const int kernel_w, const int pad_h, + const int pad_w, const int stride_h, const int stride_w, Dtype* data_col) { // We are going to launch channels * height_col * width_col kernels, each // kernel responsible for copying a single-channel grid. int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; @@ -52,14 +50,13 @@ void im2col_gpu(const Dtype* data_im, const int channels, int num_kernels = channels * height_col * width_col; // NOLINT_NEXT_LINE(whitespace/operators) im2col_gpu_kernel<<>>( + CAFFE_CUDA_NUM_THREADS>>>( num_kernels, data_im, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, height_col, width_col, data_col); CUDA_POST_KERNEL_CHECK; } - // Explicit instantiation template void im2col_gpu(const float* data_im, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, @@ -88,19 +85,9 @@ __global__ void col2im_gpu_kernel(const int n, const Dtype* data_col, int w_col_end = min(w / stride_w + 1, width_col); int h_col_start = (h < patch_h) ? 
0 : (h - patch_h) / stride_h + 1; int h_col_end = min(h / stride_h + 1, height_col); - /* - for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { - for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { - // the col location: [c * width * height + h_out, w_out] - int c_col = c * patch_h * patch_w + (h - h_col * stride_h) * ksize - + (w - w_col * stride_w); - val += data_col[(c_col * height_col + h_col) * width_col + w_col]; - } - } - */ // equivalent implementation int offset = - (c * patch_h * patch_w + h * patch_w + w) * height_col * width_col; + (c * patch_h * patch_w + h * patch_w + w) * height_col * width_col; int coeff_h_col = (1 - stride_h * patch_w * height_col) * width_col; int coeff_w_col = (1 - stride_w * height_col * width_col); for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { @@ -113,10 +100,9 @@ __global__ void col2im_gpu_kernel(const int n, const Dtype* data_col, } template -void col2im_gpu(const Dtype* data_col, const int channels, - const int height, const int width, const int patch_h, const int patch_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, Dtype* data_im) { +void col2im_gpu(const Dtype* data_col, const int channels, const int height, + const int width, const int patch_h, const int patch_w, const int pad_h, + const int pad_w, const int stride_h, const int stride_w, Dtype* data_im) { int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1; int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1; int num_kernels = channels * height * width; @@ -124,7 +110,7 @@ void col2im_gpu(const Dtype* data_col, const int channels, // bottom dimension, and then in the kernel add up the top dimensions. // NOLINT_NEXT_LINE(whitespace/operators) col2im_gpu_kernel<<>>( + CAFFE_CUDA_NUM_THREADS>>>( num_kernels, data_col, height, width, channels, patch_h, patch_w, pad_h, pad_w, stride_h, stride_w, height_col, width_col, data_im); @@ -134,11 +120,11 @@ void col2im_gpu(const Dtype* data_col, const int channels, // Explicit instantiation template void col2im_gpu(const float* data_col, const int channels, const int height, const int width, const int patch_h, const int patch_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, float* data_im); + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + float* data_im); template void col2im_gpu(const double* data_col, const int channels, const int height, const int width, const int patch_h, const int patch_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, double* data_im); + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + double* data_im); } // namespace caffe diff --git a/src/caffe/util/insert_splits.cpp b/src/caffe/util/insert_splits.cpp index 416f80ab..7974b0ea 100644 --- a/src/caffe/util/insert_splits.cpp +++ b/src/caffe/util/insert_splits.cpp @@ -30,8 +30,8 @@ void InsertSplits(const NetParameter& param, NetParameter* param_split) { layer_idx_to_layer_name[i] = layer_param.name(); for (int j = 0; j < layer_param.bottom_size(); ++j) { const string& blob_name = layer_param.bottom(j); - if (blob_name_to_last_top_idx.find(blob_name) == - blob_name_to_last_top_idx.end()) { + if (blob_name_to_last_top_idx.find(blob_name) + == blob_name_to_last_top_idx.end()) { LOG(FATAL) << "Unknown blob input " << blob_name << " to layer " << j; } const pair& bottom_idx = make_pair(i, j); @@ -45,8 +45,8 @@ void InsertSplits(const NetParameter& param, NetParameter* param_split) { } // A use 
of a top blob as a loss should be handled similarly to the use of // a top blob as an input (bottom) blob to another layer. - const int last_loss = - std::min(layer_param.loss_weight_size(), layer_param.top_size()); + const int last_loss = std::min(layer_param.loss_weight_size(), + layer_param.top_size()); for (int j = 0; j < last_loss; ++j) { const string& blob_name = layer_param.top(j); const pair& top_idx = blob_name_to_last_top_idx[blob_name]; @@ -74,14 +74,15 @@ void InsertSplits(const NetParameter& param, NetParameter* param_split) { layer_param->CopyFrom(param.layer(i)); // Replace any shared bottom blobs with split layer outputs. for (int j = 0; j < layer_param->bottom_size(); ++j) { - const pair& top_idx = - bottom_idx_to_source_top_idx[make_pair(i, j)]; + const pair& top_idx = bottom_idx_to_source_top_idx[make_pair(i, + j)]; const int split_count = top_idx_to_bottom_count[top_idx]; if (split_count > 1) { const string& layer_name = layer_idx_to_layer_name[top_idx.first]; const string& blob_name = layer_param->bottom(j); - layer_param->set_bottom(j, SplitBlobName(layer_name, - blob_name, top_idx.second, top_idx_to_bottom_split_idx[top_idx]++)); + layer_param->set_bottom(j, + SplitBlobName(layer_name, blob_name, top_idx.second, + top_idx_to_bottom_split_idx[top_idx]++)); } } // Create split layer for any top blobs used by other layer as bottom @@ -94,8 +95,8 @@ void InsertSplits(const NetParameter& param, NetParameter* param_split) { const string& blob_name = layer_param->top(j); LayerParameter* split_layer_param = param_split->add_layer(); const float loss_weight = top_idx_to_loss_weight[top_idx]; - ConfigureSplitLayer(layer_name, blob_name, j, split_count, - loss_weight, split_layer_param); + ConfigureSplitLayer(layer_name, blob_name, j, split_count, loss_weight, + split_layer_param); if (loss_weight) { layer_param->clear_loss_weight(); top_idx_to_bottom_split_idx[top_idx]++; diff --git a/src/caffe/util/io.cpp b/src/caffe/util/io.cpp index 77ef7f25..09824880 100644 --- a/src/caffe/util/io.cpp +++ b/src/caffe/util/io.cpp @@ -67,11 +67,10 @@ void WriteProtoToBinaryFile(const Message& proto, const char* filename) { CHECK(proto.SerializeToOstream(&output)); } -cv::Mat ReadImageToCVMat(const string& filename, - const int height, const int width, const bool is_color) { +cv::Mat ReadImageToCVMat(const string& filename, const int height, + const int width, const bool is_color) { cv::Mat cv_img; - int cv_read_flag = (is_color ? CV_LOAD_IMAGE_COLOR : - CV_LOAD_IMAGE_GRAYSCALE); + int cv_read_flag = (is_color ? CV_LOAD_IMAGE_COLOR : CV_LOAD_IMAGE_GRAYSCALE); cv::Mat cv_img_origin = cv::imread(filename, cv_read_flag); if (!cv_img_origin.data) { LOG(ERROR) << "Could not open or find file " << filename; @@ -85,13 +84,12 @@ cv::Mat ReadImageToCVMat(const string& filename, return cv_img; } -cv::Mat ReadImageToCVMat(const string& filename, - const int height, const int width) { +cv::Mat ReadImageToCVMat(const string& filename, const int height, + const int width) { return ReadImageToCVMat(filename, height, width, true); } -cv::Mat ReadImageToCVMat(const string& filename, - const bool is_color) { +cv::Mat ReadImageToCVMat(const string& filename, const bool is_color) { return ReadImageToCVMat(filename, 0, 0, is_color); } @@ -99,31 +97,30 @@ cv::Mat ReadImageToCVMat(const string& filename) { return ReadImageToCVMat(filename, 0, 0, true); } // Do the file extension and encoding match? 
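
Aside (editor's illustration, not part of the patch): the matchExt/ReadImageToDatum hunks that follow exist so that an image whose on-disk encoding already matches the requested one can be copied into the Datum byte-for-byte, skipping an OpenCV decode/re-encode round trip. A minimal sketch of that decision is below, assuming OpenCV 2.x as the patch does; encode_for_datum and already_in_requested_encoding are hypothetical names standing in for the real ReadImageToDatum/matchExt pair.

#include <opencv2/highgui/highgui.hpp>
#include <fstream>
#include <iterator>
#include <string>
#include <vector>

// Returns the bytes that would be stored in the Datum's data field.
static std::vector<unsigned char> encode_for_datum(
    const std::string& filename, const std::string& encoding,
    bool is_color, bool already_in_requested_encoding) {
  if (already_in_requested_encoding) {
    // Fast path: the file is already in the requested format, so copy the
    // raw bytes instead of decoding and re-encoding.
    std::ifstream file(filename.c_str(), std::ios::in | std::ios::binary);
    return std::vector<unsigned char>((std::istreambuf_iterator<char>(file)),
                                      std::istreambuf_iterator<char>());
  }
  // Slow path: decode with OpenCV, then re-encode to the requested format.
  int flag = is_color ? CV_LOAD_IMAGE_COLOR : CV_LOAD_IMAGE_GRAYSCALE;
  cv::Mat img = cv::imread(filename, flag);
  std::vector<unsigned char> buf;
  if (!img.empty()) {
    cv::imencode("." + encoding, img, buf);
  }
  return buf;
}
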
-static bool matchExt(const std::string & fn, - std::string en) { +static bool matchExt(const std::string & fn, std::string en) { size_t p = fn.rfind('.'); std::string ext = p != fn.npos ? fn.substr(p) : fn; std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower); std::transform(en.begin(), en.end(), en.begin(), ::tolower); - if ( ext == en ) + if (ext == en) return true; - if ( en == "jpg" && ext == "jpeg" ) + if (en == "jpg" && ext == "jpeg") return true; return false; } -bool ReadImageToDatum(const string& filename, const int label, - const int height, const int width, const bool is_color, - const std::string & encoding, Datum* datum) { +bool ReadImageToDatum(const string& filename, const int label, const int height, + const int width, const bool is_color, const std::string & encoding, + Datum* datum) { cv::Mat cv_img = ReadImageToCVMat(filename, height, width, is_color); if (cv_img.data) { if (encoding.size()) { - if ( (cv_img.channels() == 3) == is_color && !height && !width && - matchExt(filename, encoding) ) + if ((cv_img.channels() == 3) == is_color && !height && !width + && matchExt(filename, encoding)) return ReadFileToDatum(filename, label, datum); - std::vector buf; - cv::imencode("."+encoding, cv_img, buf); - datum->set_data(std::string(reinterpret_cast(&buf[0]), - buf.size())); + std::vector < uchar > buf; + cv::imencode("." + encoding, cv_img, buf); + datum->set_data( + std::string(reinterpret_cast(&buf[0]), buf.size())); datum->set_label(label); datum->set_encoded(true); return true; @@ -136,11 +133,10 @@ bool ReadImageToDatum(const string& filename, const int label, } } -bool ReadFileToDatum(const string& filename, const int label, - Datum* datum) { +bool ReadFileToDatum(const string& filename, const int label, Datum* datum) { std::streampos size; - fstream file(filename.c_str(), ios::in|ios::binary|ios::ate); + fstream file(filename.c_str(), ios::in | ios::binary | ios::ate); if (file.is_open()) { size = file.tellg(); std::string buffer(size, ' '); @@ -172,8 +168,7 @@ cv::Mat DecodeDatumToCVMat(const Datum& datum, bool is_color) { CHECK(datum.encoded()) << "Datum not encoded"; const string& data = datum.data(); std::vector vec_data(data.c_str(), data.c_str() + data.size()); - int cv_read_flag = (is_color ? CV_LOAD_IMAGE_COLOR : - CV_LOAD_IMAGE_GRAYSCALE); + int cv_read_flag = (is_color ? CV_LOAD_IMAGE_COLOR : CV_LOAD_IMAGE_GRAYSCALE); cv_img = cv::imdecode(vec_data, cv_read_flag); if (!cv_img.data) { LOG(ERROR) << "Could not decode datum "; @@ -216,7 +211,7 @@ void CVMatToDatum(const cv::Mat& cv_img, Datum* datum) { int datum_size = datum_channels * datum_height * datum_width; std::string buffer(datum_size, ' '); for (int h = 0; h < datum_height; ++h) { - const uchar* ptr = cv_img.ptr(h); + const uchar* ptr = cv_img.ptr < uchar > (h); int img_index = 0; for (int w = 0; w < datum_width; ++w) { for (int c = 0; c < datum_channels; ++c) { @@ -230,9 +225,8 @@ void CVMatToDatum(const cv::Mat& cv_img, Datum* datum) { // Verifies format of data stored in HDF5 file and reshapes blob accordingly. template -void hdf5_load_nd_dataset_helper( - hid_t file_id, const char* dataset_name_, int min_dim, int max_dim, - Blob* blob) { +void hdf5_load_nd_dataset_helper(hid_t file_id, const char* dataset_name_, + int min_dim, int max_dim, Blob* blob) { // Verify that the dataset exists. 
CHECK(H5LTfind_dataset(file_id, dataset_name_)) << "Failed to find HDF5 dataset " << dataset_name_; @@ -245,10 +239,10 @@ void hdf5_load_nd_dataset_helper( CHECK_LE(ndims, max_dim); // Verify that the data format is what we expect: float or double. - std::vector dims(ndims); + std::vector < hsize_t > dims(ndims); H5T_class_t class_; - status = H5LTget_dataset_info( - file_id, dataset_name_, dims.data(), &class_, NULL); + status = H5LTget_dataset_info(file_id, dataset_name_, dims.data(), &class_, + NULL); CHECK_GE(status, 0) << "Failed to get dataset info for " << dataset_name_; CHECK_EQ(class_, H5T_FLOAT) << "Expected float or double data"; @@ -261,45 +255,45 @@ void hdf5_load_nd_dataset_helper( template <> void hdf5_load_nd_dataset(hid_t file_id, const char* dataset_name_, - int min_dim, int max_dim, Blob* blob) { + int min_dim, int max_dim, Blob* blob) { hdf5_load_nd_dataset_helper(file_id, dataset_name_, min_dim, max_dim, blob); - herr_t status = H5LTread_dataset_float( - file_id, dataset_name_, blob->mutable_cpu_data()); + herr_t status = H5LTread_dataset_float(file_id, dataset_name_, + blob->mutable_cpu_data()); CHECK_GE(status, 0) << "Failed to read float dataset " << dataset_name_; } template <> void hdf5_load_nd_dataset(hid_t file_id, const char* dataset_name_, - int min_dim, int max_dim, Blob* blob) { + int min_dim, int max_dim, Blob* blob) { hdf5_load_nd_dataset_helper(file_id, dataset_name_, min_dim, max_dim, blob); - herr_t status = H5LTread_dataset_double( - file_id, dataset_name_, blob->mutable_cpu_data()); + herr_t status = H5LTread_dataset_double(file_id, dataset_name_, + blob->mutable_cpu_data()); CHECK_GE(status, 0) << "Failed to read double dataset " << dataset_name_; } template <> -void hdf5_save_nd_dataset( - const hid_t file_id, const string& dataset_name, const Blob& blob) { +void hdf5_save_nd_dataset(const hid_t file_id, + const string& dataset_name, const Blob& blob) { hsize_t dims[HDF5_NUM_DIMS]; dims[0] = blob.num(); dims[1] = blob.channels(); dims[2] = blob.height(); dims[3] = blob.width(); - herr_t status = H5LTmake_dataset_float( - file_id, dataset_name.c_str(), HDF5_NUM_DIMS, dims, blob.cpu_data()); + herr_t status = H5LTmake_dataset_float(file_id, dataset_name.c_str(), + HDF5_NUM_DIMS, dims, blob.cpu_data()); CHECK_GE(status, 0) << "Failed to make float dataset " << dataset_name; } template <> -void hdf5_save_nd_dataset( - const hid_t file_id, const string& dataset_name, const Blob& blob) { +void hdf5_save_nd_dataset(const hid_t file_id, + const string& dataset_name, const Blob& blob) { hsize_t dims[HDF5_NUM_DIMS]; dims[0] = blob.num(); dims[1] = blob.channels(); dims[2] = blob.height(); dims[3] = blob.width(); - herr_t status = H5LTmake_dataset_double( - file_id, dataset_name.c_str(), HDF5_NUM_DIMS, dims, blob.cpu_data()); + herr_t status = H5LTmake_dataset_double(file_id, dataset_name.c_str(), + HDF5_NUM_DIMS, dims, blob.cpu_data()); CHECK_GE(status, 0) << "Failed to make double dataset " << dataset_name; } diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index 0aab6b17..e45fd564 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -1,3 +1,29 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + #include #include @@ -6,29 +32,34 @@ #include "caffe/common.hpp" #include "caffe/util/math_functions.hpp" #include "caffe/util/rng.hpp" +#include "caffe/util/ocl_util.hpp" +#include "caffe/util/ocl_wrapper.hpp" + +static const clblasOrder order = clblasColumnMajor; +#define pi 3.1415926 namespace caffe { -template<> +template <> void caffe_cpu_gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C) { int lda = (TransA == CblasNoTrans) ? K : M; int ldb = (TransB == CblasNoTrans) ? N : K; - cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, - ldb, beta, C, N); + cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, + beta, C, N); } -template<> +template <> void caffe_cpu_gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const double alpha, const double* A, const double* B, const double beta, double* C) { int lda = (TransA == CblasNoTrans) ? K : M; int ldb = (TransB == CblasNoTrans) ? 
N : K; - cblas_dgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, - ldb, beta, C, N); + cblas_dgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, + beta, C, N); } template <> @@ -47,16 +78,20 @@ void caffe_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, template <> void caffe_axpy(const int N, const float alpha, const float* X, - float* Y) { cblas_saxpy(N, alpha, X, 1, Y, 1); } + float* Y) { + cblas_saxpy(N, alpha, X, 1, Y, 1); +} template <> void caffe_axpy(const int N, const double alpha, const double* X, - double* Y) { cblas_daxpy(N, alpha, X, 1, Y, 1); } + double* Y) { + cblas_daxpy(N, alpha, X, 1, Y, 1); +} -template -void caffe_set(const int N, const Dtype alpha, Dtype* Y) { +template <> +void caffe_set(const int N, const float alpha, float* Y) { if (alpha == 0) { - memset(Y, 0, sizeof(Dtype) * N); // NOLINT(caffe/alt_fn) + memset(Y, 0, sizeof(float) * N); return; } for (int i = 0; i < N; ++i) { @@ -64,10 +99,28 @@ void caffe_set(const int N, const Dtype alpha, Dtype* Y) { } } -template void caffe_set(const int N, const int alpha, int* Y); -template void caffe_set(const int N, const float alpha, float* Y); -template void caffe_set(const int N, const double alpha, double* Y); +template <> +void caffe_set(const int N, const double alpha, double* Y) { + if (alpha == 0) { + memset(Y, 0, sizeof(double) * N); + return; + } + for (int i = 0; i < N; ++i) { + Y[i] = alpha; + } +} + +/* +template <> +void caffe_copy(const int N, const float* X, float* Y) { + cblas_scopy(N, X, 1, Y, 1); +} +template <> +void caffe_copy(const int N, const double* X, double* Y) { + cblas_dcopy(N, X, 1, Y, 1); +} +*/ template <> void caffe_add_scalar(const int N, const float alpha, float* Y) { for (int i = 0; i < N; ++i) { @@ -82,28 +135,6 @@ void caffe_add_scalar(const int N, const double alpha, double* Y) { } } -template -void caffe_copy(const int N, const Dtype* X, Dtype* Y) { - if (X != Y) { - if (Caffe::mode() == Caffe::GPU) { -#ifndef CPU_ONLY - // NOLINT_NEXT_LINE(caffe/alt_fn) - CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyDefault)); -#else - NO_GPU; -#endif - } else { - memcpy(Y, X, sizeof(Dtype) * N); // NOLINT(caffe/alt_fn) - } - } -} - -template void caffe_copy(const int N, const int* X, int* Y); -template void caffe_copy(const int N, const unsigned int* X, - unsigned int* Y); -template void caffe_copy(const int N, const float* X, float* Y); -template void caffe_copy(const int N, const double* X, double* Y); - template <> void caffe_scal(const int N, const float alpha, float *X) { cblas_sscal(N, alpha, X, 1); @@ -116,19 +147,18 @@ void caffe_scal(const int N, const double alpha, double *X) { template <> void caffe_cpu_axpby(const int N, const float alpha, const float* X, - const float beta, float* Y) { + const float beta, float* Y) { cblas_saxpby(N, alpha, X, 1, beta, Y, 1); } template <> void caffe_cpu_axpby(const int N, const double alpha, const double* X, - const double beta, double* Y) { + const double beta, double* Y) { cblas_daxpby(N, alpha, X, 1, beta, Y, 1); } template <> -void caffe_add(const int n, const float* a, const float* b, - float* y) { +void caffe_add(const int n, const float* a, const float* b, float* y) { vsAdd(n, a, b, y); } @@ -139,8 +169,7 @@ void caffe_add(const int n, const double* a, const double* b, } template <> -void caffe_sub(const int n, const float* a, const float* b, - float* y) { +void caffe_sub(const int n, const float* a, const float* b, float* y) { vsSub(n, a, b, y); } @@ -151,8 +180,7 @@ void caffe_sub(const int n, 
const double* a, const double* b, } template <> -void caffe_mul(const int n, const float* a, const float* b, - float* y) { +void caffe_mul(const int n, const float* a, const float* b, float* y) { vsMul(n, a, b, y); } @@ -163,8 +191,67 @@ void caffe_mul(const int n, const double* a, const double* b, } template <> -void caffe_div(const int n, const float* a, const float* b, - float* y) { +float caffe_cpu_strided_dot(const int n, const float* x, const int incx, + const float* y, const int incy) { + return cblas_sdot(n, x, incx, y, incy); +} + +template <> +double caffe_cpu_strided_dot(const int n, const double* x, + const int incx, const double* y, const int incy) { + return cblas_ddot(n, x, incx, y, incy); +} + +template +void caffe_set(const int N, const Dtype alpha, Dtype* Y) { + if (alpha == 0) { + memset(Y, 0, sizeof(Dtype) * N); // NOLINT(caffe/alt_fn) + return; + } + for (int i = 0; i < N; ++i) { + Y[i] = alpha; + } +} + +template void caffe_set(const int N, const int alpha, int* Y); +template void caffe_set(const int N, const float alpha, float* Y); +template void caffe_set(const int N, const double alpha, double* Y); + +template <> +void caffe_log(const int n, const float* a, float* y) { + vsLn(n, a, y); +} + +template <> +void caffe_log(const int n, const double* a, double* y) { + vdLn(n, a, y); +} + +template +void caffe_copy(const int N, const Dtype* X, Dtype* Y) { + if (X != Y) { + memcpy(Y, X, sizeof(Dtype) * N); // NOLINT(caffe/alt_fn) + } +} + +template void caffe_copy(const int N, const int* X, int* Y); +template void caffe_copy(const int N, const unsigned int* X, + unsigned int* Y); +template void caffe_copy(const int N, const float* X, float* Y); +template void caffe_copy(const int N, const double* X, double* Y); + +template <> +void caffe_abs(const int n, const float* a, float* y) { + vsAbs(n, a, y); +} + +template <> +void caffe_abs(const int n, const double* a, double* y) { + vdAbs(n, a, y); +} + +template <> +void caffe_div(const int n, const float* a, const float* b, float* y) { vsDiv(n, a, b, y); } @@ -175,8 +262,7 @@ void caffe_div(const int n, const double* a, const double* b, } template <> -void caffe_powx(const int n, const float* a, const float b, - float* y) { +void caffe_powx(const int n, const float* a, const float b, float* y) { vsPowx(n, a, b, y); } @@ -206,84 +292,57 @@ void caffe_exp(const int n, const double* a, double* y) { vdExp(n, a, y); } -template <> -void caffe_log(const int n, const float* a, float* y) { - vsLn(n, a, y); -} - -template <> -void caffe_log(const int n, const double* a, double* y) { - vdLn(n, a, y); -} - -template <> -void caffe_abs(const int n, const float* a, float* y) { - vsAbs(n, a, y); -} - -template <> -void caffe_abs(const int n, const double* a, double* y) { - vdAbs(n, a, y); -} - unsigned int caffe_rng_rand() { return (*caffe_rng())(); } template Dtype caffe_nextafter(const Dtype b) { - return boost::math::nextafter( - b, std::numeric_limits::max()); + return boost::math::nextafter < Dtype + > (b, std::numeric_limits < Dtype > ::max()); } - -template -float caffe_nextafter(const float b); - -template -double caffe_nextafter(const double b); +template float caffe_nextafter(const float b); +template double caffe_nextafter(const double b); template void caffe_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r) { CHECK_GE(n, 0); CHECK(r); CHECK_LE(a, b); - boost::uniform_real random_distribution(a, caffe_nextafter(b)); - boost::variate_generator > - variate_generator(caffe_rng(), random_distribution); + 
boost::uniform_real < Dtype + > random_distribution(a, caffe_nextafter(b)); + boost::variate_generator > variate_generator( + caffe_rng(), random_distribution); for (int i = 0; i < n; ++i) { r[i] = variate_generator(); } -} -template -void caffe_rng_uniform(const int n, const float a, const float b, - float* r); + //LOG(INFO) << "caffe_rng_uniform"; +} -template -void caffe_rng_uniform(const int n, const double a, const double b, - double* r); +template void caffe_rng_uniform(const int n, const float a, const float b, + float* r); +template void caffe_rng_uniform(const int n, const double a, const double b, + double* r); template -void caffe_rng_gaussian(const int n, const Dtype a, - const Dtype sigma, Dtype* r) { +void caffe_rng_gaussian(const int n, const Dtype a, const Dtype sigma, + Dtype* r) { CHECK_GE(n, 0); CHECK(r); CHECK_GT(sigma, 0); - boost::normal_distribution random_distribution(a, sigma); - boost::variate_generator > - variate_generator(caffe_rng(), random_distribution); + boost::normal_distribution < Dtype > random_distribution(a, sigma); + boost::variate_generator > variate_generator( + caffe_rng(), random_distribution); for (int i = 0; i < n; ++i) { r[i] = variate_generator(); } } -template -void caffe_rng_gaussian(const int n, const float mu, - const float sigma, float* r); - -template -void caffe_rng_gaussian(const int n, const double mu, - const double sigma, double* r); +template void caffe_rng_gaussian(const int n, const float mu, const float sigma, + float* r); +template void caffe_rng_gaussian(const int n, const double mu, + const double sigma, double* r); template void caffe_rng_bernoulli(const int n, const Dtype p, int* r) { @@ -291,19 +350,16 @@ void caffe_rng_bernoulli(const int n, const Dtype p, int* r) { CHECK(r); CHECK_GE(p, 0); CHECK_LE(p, 1); - boost::bernoulli_distribution random_distribution(p); - boost::variate_generator > - variate_generator(caffe_rng(), random_distribution); + boost::bernoulli_distribution < Dtype > random_distribution(p); + boost::variate_generator > variate_generator( + caffe_rng(), random_distribution); for (int i = 0; i < n; ++i) { r[i] = variate_generator(); } } -template -void caffe_rng_bernoulli(const int n, const double p, int* r); - -template -void caffe_rng_bernoulli(const int n, const float p, int* r); +template void caffe_rng_bernoulli(const int n, const double p, int* r); +template void caffe_rng_bernoulli(const int n, const float p, int* r); template void caffe_rng_bernoulli(const int n, const Dtype p, unsigned int* r) { @@ -311,61 +367,45 @@ void caffe_rng_bernoulli(const int n, const Dtype p, unsigned int* r) { CHECK(r); CHECK_GE(p, 0); CHECK_LE(p, 1); - boost::bernoulli_distribution random_distribution(p); - boost::variate_generator > - variate_generator(caffe_rng(), random_distribution); + boost::bernoulli_distribution < Dtype > random_distribution(p); + boost::variate_generator > variate_generator( + caffe_rng(), random_distribution); for (int i = 0; i < n; ++i) { r[i] = static_cast(variate_generator()); } } -template -void caffe_rng_bernoulli(const int n, const double p, unsigned int* r); - -template -void caffe_rng_bernoulli(const int n, const float p, unsigned int* r); +template void caffe_rng_bernoulli(const int n, const double p, unsigned int* r); +template void caffe_rng_bernoulli(const int n, const float p, unsigned int* r); template <> -float caffe_cpu_strided_dot(const int n, const float* x, const int incx, - const float* y, const int incy) { - return cblas_sdot(n, x, incx, y, incy); +float 
caffe_cpu_dot(const int n, const float* x, const float* y) { + return cblas_sdot(n, x, 1, y, 1); } template <> -double caffe_cpu_strided_dot(const int n, const double* x, - const int incx, const double* y, const int incy) { - return cblas_ddot(n, x, incx, y, incy); -} - -template -Dtype caffe_cpu_dot(const int n, const Dtype* x, const Dtype* y) { - return caffe_cpu_strided_dot(n, x, 1, y, 1); +double caffe_cpu_dot(const int n, const double* x, const double* y) { + return cblas_ddot(n, x, 1, y, 1); } -template -float caffe_cpu_dot(const int n, const float* x, const float* y); - -template -double caffe_cpu_dot(const int n, const double* x, const double* y); - template <> int caffe_cpu_hamming_distance(const int n, const float* x, - const float* y) { + const float* y) { int dist = 0; for (int i = 0; i < n; ++i) { - dist += __builtin_popcount(static_cast(x[i]) ^ - static_cast(y[i])); + dist += __builtin_popcount( + static_cast(x[i]) ^ static_cast(y[i])); } return dist; } template <> int caffe_cpu_hamming_distance(const int n, const double* x, - const double* y) { + const double* y) { int dist = 0; for (int i = 0; i < n; ++i) { - dist += __builtin_popcountl(static_cast(x[i]) ^ - static_cast(y[i])); + dist += __builtin_popcountl( + static_cast(x[i]) ^ static_cast(y[i])); } return dist; } @@ -380,18 +420,637 @@ double caffe_cpu_asum(const int n, const double* x) { return cblas_dasum(n, x, 1); } +INSTANTIATE_CAFFE_CPU_UNARY_FUNC (sign); +INSTANTIATE_CAFFE_CPU_UNARY_FUNC (sgnbit); +INSTANTIATE_CAFFE_CPU_UNARY_FUNC (fabs); + template <> void caffe_cpu_scale(const int n, const float alpha, const float *x, - float* y) { + float* y) { cblas_scopy(n, x, 1, y, 1); cblas_sscal(n, alpha, y, 1); } template <> void caffe_cpu_scale(const int n, const double alpha, const double *x, - double* y) { + double* y) { cblas_dcopy(n, x, 1, y, 1); cblas_dscal(n, alpha, y, 1); } +#ifndef CPU_ONLY +//DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sign, y[index] = (Dtype(0) < x[index]) +// - (x[index] < Dtype(0))); +//DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sgnbit, y[index] = signbit(x[index])); + +template <> +void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C) { + clblasTranspose transA = + (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; + clblasTranspose transB = + (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans; + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + int ldc = N; + CLBLAS_CHECK( + clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha, + (cl_mem) B, 0, ldb, (cl_mem) A, 0, lda, (cl_float) beta, (cl_mem) C, + 0, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); +} + +template <> +void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C) { + clblasTranspose transA = + (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; + clblasTranspose transB = + (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans; + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? 
N : K; + int ldc = N; + CLBLAS_CHECK( + clblasDgemm(amdDevice.col, transB, transA, N, M, K, alpha, + (cl_mem) B, 0, ldb, (cl_mem) A, 0, lda, beta, (cl_mem) C, + 0, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); +} + +template <> +cl_event caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const float alpha, const float* A, const int offA, const float* B, + const int offB, const float beta, float* C, const int offC) { + cl_event event; + clblasTranspose transA = + (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; + clblasTranspose transB = + (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans; + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + int ldc = N; + CLBLAS_CHECK( + clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha, + (cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta, + (cl_mem) C, offC, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, + &event)); + return event; +} + +template <> +cl_event caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const double alpha, const double* A, const int offA, const double* B, + const int offB, const double beta, double* C, const int offC) { + cl_event event; + clblasTranspose transA = + (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; + clblasTranspose transB = + (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans; + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + int ldc = N; + CLBLAS_CHECK( + clblasDgemm(amdDevice.col, transB, transA, N, M, K, alpha, + (cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, beta, + (cl_mem) C, offC, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, + &event)); + return event; +} + +template <> +cl_event caffe_gpu_gemm(cl_command_queue *queue, + const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, + const int N, const int K, const float alpha, const float* A, const int offA, + const float* B, const int offB, const float beta, float* C, + const int offC) { + cl_event event; + clblasTranspose transA = + (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; + clblasTranspose transB = + (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans; + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + int ldc = N; + CLBLAS_CHECK( + clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha, + (cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta, + (cl_mem) C, offC, ldc, 1, queue, 0, NULL, &event)); + return event; +} + +template <> +cl_event caffe_gpu_gemm(cl_command_queue *queue, + const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, + const int N, const int K, const double alpha, const double* A, + const int offA, const double* B, const int offB, const double beta, + double* C, const int offC) { + cl_event event; + clblasTranspose transA = + (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; + clblasTranspose transB = + (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans; + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? 
N : K; + int ldc = N; + CLBLAS_CHECK( + clblasDgemm(amdDevice.col, transB, transA, N, M, K, alpha, + (cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, beta, + (cl_mem) C, offC, ldc, 1, queue, 0, NULL, &event)); + return event; +} + +template <> +void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, + const int N, const float alpha, const float* A, size_t offA, int lda, + const float* x, size_t offx, const float beta, int incx, float* y, + size_t offy, int incy) { + clblasTranspose transA = + (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; + CLBLAS_CHECK( + clblasSgemv(amdDevice.row, transA, M, N, (cl_float) alpha, (cl_mem) A, + offA, lda, (cl_mem) x, offx, incx, (cl_float) beta, (cl_mem) y, offy, + incy, 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); +} + +template <> +void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, + const int N, const double alpha, const double* A, size_t offA, int lda, + const double* x, size_t offx, const double beta, int incx, double* y, + size_t offy, int incy) { + clblasTranspose transA = + (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; + CLBLAS_CHECK( + clblasDgemv(amdDevice.row, transA, M, N, (cl_double) alpha, (cl_mem) A, + offA, lda, (cl_mem) x, offx, incx, (cl_double) beta, (cl_mem) y, offy, + incy, 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); +} + +template <> +void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, + const int N, const float alpha, const float* A, const float* x, + const float beta, float* y) { + clblasTranspose transA = + (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; + CLBLAS_CHECK( + clblasSgemv(amdDevice.row, transA, M, N, (cl_float) alpha, (cl_mem) A, 0, + N, (cl_mem) x, 0, 1, (cl_float) beta, (cl_mem) y, 0, 1, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL)); +} + +template <> +void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, + const int N, const double alpha, const double* A, const double* x, + const double beta, double* y) { + clblasTranspose transA = + (TransA == CblasNoTrans) ? 
clblasNoTrans : clblasTrans; + CLBLAS_CHECK( + clblasDgemv(amdDevice.row, transA, M, N, (cl_double) alpha, (cl_mem) A, 0, + N, (cl_mem) x, 0, 1, (cl_double) beta, (cl_mem) y, 0, 1, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL)); +} + +template <> +void caffe_gpu_axpy(const int N, const float alpha, const float* X, + float* Y) { + CLBLAS_CHECK( + clblasSaxpy(N, alpha, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL)); +} + +template <> +void caffe_gpu_axpy(const int N, const double alpha, const double* X, + double* Y) { + CLBLAS_CHECK( + clblasDaxpy(N, alpha, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL)); +} + +template <> +void caffe_gpu_sgnbit(const int n, const float* x, float* y) { + caffe_gpu_signbit(n, x, y); +} + +template <> +void caffe_gpu_sgnbit(const int n, const double* x, double* y) { + caffe_gpu_signbit(n, x, y); +} + +template <> +void caffe_gpu_abs(const int n, const float* x, float* y) { + caffe_gpu_abs_ocl(n, x, y); +} + +template <> +void caffe_gpu_abs(const int n, const double* x, double* y) { + caffe_gpu_abs_ocl(n, x, y); +} + +void caffe_gpu_memcpy(const size_t N, const void *X, void *Y) { + clEnqueueReadBuffer(amdDevice.CommandQueue, (cl_mem) X, CL_TRUE, 0, N, Y, 0, + NULL, NULL); +} +template <> +void caffe_gpu_memcpy(const size_t N, const float* X, float* Y) { + OCL_CHECK( + clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) X, (cl_mem) Y, 0, 0, + N, 0, NULL, NULL)); +} + +template <> +void caffe_gpu_memcpy(const size_t N, const double* X, double* Y) { + OCL_CHECK( + clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) X, (cl_mem) Y, 0, 0, + N, 0, NULL, NULL)); +} + +template +void caffe_gpu_copy(const int N, const Dtype* X, Dtype* Y) { + if (X != Y) { + OCL_CHECK( + clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) X, (cl_mem) Y, 0, 0, + N * sizeof(Dtype), 0, NULL, NULL)); + } +} +template void caffe_gpu_copy(const int N, const float* X, float* Y); +template void caffe_gpu_copy(const int N, const double* X, double* Y); +template void caffe_gpu_copy(const int N, const int* X, int* Y); +template void caffe_gpu_copy(const int N, const unsigned int* X, unsigned int* Y); + +template <> +void caffe_gpu_copy(const int N, const float* X, const int offx, float* Y, const int offy) { + if (X != Y) { + CLBLAS_CHECK( + clblasScopy(N, (cl_mem) X, offx, 1, (cl_mem) Y, offy, 1, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL)); + } +} + +template <> +void caffe_gpu_copy(const int N, const double* X, const int offx, double* Y, const int offy) { + if (X != Y) { + CLBLAS_CHECK( + clblasDcopy(N, (cl_mem) X, offx, 1, (cl_mem) Y, offy, 1, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL)); + } +} + +template <> +void caffe_gpu_scal(const int N, const float alpha, float *X, const int offx) { + CLBLAS_CHECK( + clblasSscal(N, alpha, (cl_mem) X, offx, 1, 1, &(amdDevice.CommandQueue), 0, + NULL, NULL)); +} + +template <> +void caffe_gpu_scal(const int N, const double alpha, double *X, const int offx) { + CLBLAS_CHECK( + clblasDscal(N, alpha, (cl_mem) X, offx, 1, 1, &(amdDevice.CommandQueue), 0, + NULL, NULL)); +} + +template <> +void caffe_gpu_axpby(const int N, const float alpha, const float* X, + const float beta, float* Y) { + caffe_gpu_scal(N, beta, Y); + caffe_gpu_axpy(N, alpha, X, Y); +} + +template <> +void caffe_gpu_axpby(const int N, const double alpha, const double* X, + const double beta, double* Y) { + caffe_gpu_scal(N, beta, Y); + caffe_gpu_axpy(N, alpha, X, Y); +} + +template <> +void caffe_gpu_dot(const 
int n, const float* x, const float* y, + float* out) { + cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (n * sizeof(float)), NULL, NULL); + cl_mem d_out = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (1 * sizeof(float)), NULL, NULL); + clblasSdot(n, d_out, 0, (cl_mem) x, 0, 1, (cl_mem) y, 0, 1, scratchBuff, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL); + clEnqueueReadBuffer(amdDevice.CommandQueue, d_out, CL_TRUE, 0, sizeof(float), + out, 0, NULL, NULL); + clReleaseMemObject(scratchBuff); + clReleaseMemObject(d_out); +} + +template <> +void caffe_gpu_dot(const int n, const double* x, const double* y, + double * out) { + //need to pass in scratchBuff + cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (n * sizeof(double)), NULL, NULL); + cl_mem d_out = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (1 * sizeof(double)), NULL, NULL); + clblasDdot(n, d_out, 0, (cl_mem) x, 0, 1, (cl_mem) y, 0, 1, scratchBuff, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL); + clEnqueueReadBuffer(amdDevice.CommandQueue, d_out, CL_TRUE, 0, sizeof(double), + out, 0, NULL, NULL); + clReleaseMemObject(scratchBuff); + clReleaseMemObject(d_out); +} + +template <> +void caffe_gpu_dot(const int n, const float* x, size_t offx, const float* y, size_t offy, float* out) { + cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (n * sizeof(float)), NULL, NULL); + cl_mem d_out = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (1 * sizeof(float)), NULL, NULL); + clblasSdot(n, d_out, 0, (cl_mem) x, offx, 1, (cl_mem) y, offy, 1, scratchBuff, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL); + clEnqueueReadBuffer(amdDevice.CommandQueue, d_out, CL_TRUE, 0, sizeof(float), + out, 0, NULL, NULL); + clReleaseMemObject(scratchBuff); + clReleaseMemObject(d_out); +} + +template <> +void caffe_gpu_dot(const int n, const double* x, size_t offx, const double* y, size_t offy, double * out) { + //need to pass in scratchBuff + cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (n * sizeof(double)), NULL, NULL); + cl_mem d_out = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (1 * sizeof(double)), NULL, NULL); + clblasDdot(n, d_out, 0, (cl_mem) x, offx, 1, (cl_mem) y, offy, 1, scratchBuff, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL); + clEnqueueReadBuffer(amdDevice.CommandQueue, d_out, CL_TRUE, 0, sizeof(double), + out, 0, NULL, NULL); + clReleaseMemObject(scratchBuff); + clReleaseMemObject(d_out); +} + +template <> +void caffe_gpu_asum(const int n, const float* x, float* y) { + cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (n * sizeof(cl_float)), NULL, NULL); + cl_mem d_y = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (1 * sizeof(cl_float)), NULL, NULL); + clblasSasum(n, d_y, 0, (cl_mem) x, 0, 1, scratchBuff, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL); + clEnqueueReadBuffer(amdDevice.CommandQueue, d_y, CL_TRUE, 0, sizeof(float), y, + 0, NULL, NULL); + clReleaseMemObject(scratchBuff); + clReleaseMemObject(d_y); +} + +template <> +void caffe_gpu_asum(const int n, const double* x, double* y) { + cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (n * sizeof(cl_double)), NULL, NULL); + cl_mem d_y = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (1 * sizeof(cl_double)), NULL, NULL); + clblasDasum(n, d_y, 0, (cl_mem) x, 0, 1, scratchBuff, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL); + clEnqueueReadBuffer(amdDevice.CommandQueue, d_y, 
CL_TRUE, 0, sizeof(double), + y, 0, NULL, NULL); + clReleaseMemObject(scratchBuff); + clReleaseMemObject(d_y); +} + +template <> +void caffe_gpu_asum(const int n, const float* x, size_t offx, float* y) { + cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (n * sizeof(cl_float)), NULL, NULL); + cl_mem d_y = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (1 * sizeof(cl_float)), NULL, NULL); + clblasSasum(n, d_y, 0, (cl_mem) x, offx, 1, scratchBuff, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL); + clEnqueueReadBuffer(amdDevice.CommandQueue, d_y, CL_TRUE, 0, sizeof(float), y, + 0, NULL, NULL); + clReleaseMemObject(scratchBuff); + clReleaseMemObject(d_y); +} + +template <> +void caffe_gpu_asum(const int n, const double* x, size_t offx, double* y) { + cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (n * sizeof(cl_double)), NULL, NULL); + cl_mem d_y = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (1 * sizeof(cl_double)), NULL, NULL); + clblasDasum(n, d_y, 0, (cl_mem) x, offx, 1, scratchBuff, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL); + clEnqueueReadBuffer(amdDevice.CommandQueue, d_y, CL_TRUE, 0, sizeof(double), + y, 0, NULL, NULL); + clReleaseMemObject(scratchBuff); + clReleaseMemObject(d_y); +} + + +template <> +void caffe_gpu_scale(const int n, const float alpha, const float *x, + float* y) { + caffe_gpu_copy(n, x, y); + caffe_gpu_scal(n, alpha, y); +} + +template <> +void caffe_gpu_scale(const int n, const double alpha, const double *x, + double* y) { + caffe_gpu_copy(n, x, y); + caffe_gpu_scal(n, alpha, y); +} + +template <> +void caffe_gpu_scale(const int n, const float alpha, const float *x, + const int offx, float* y, const int offy) { + caffe_gpu_copy(n, x, offx, y, offy); + caffe_gpu_scal(n, alpha, y, offy); +} + +template <> +void caffe_gpu_scale(const int n, const double alpha, const double *x, + const int offx, double* y, const int offy) { + caffe_gpu_copy(n, x, offx, y, offy); + caffe_gpu_scal(n, alpha, y, offy); +} + +template +void set_kernel(const int n, const Dtype alpha, Dtype* y) { + NOT_IMPLEMENTED; +} + +template <> +void caffe_gpu_set(const int N, const float alpha, float* Y, const int offy) { + ocl_memset(Y, alpha, N, offy); +} + +template <> +void caffe_gpu_set(const int N, const double alpha, double* Y, const int offy) { + ocl_memset(Y, alpha, N, offy); +} + +template <> +void caffe_gpu_add_scalar(const int N, const float alpha, float* Y) { + kernel_add_scalar(N, alpha, Y); +} + +template <> +void caffe_gpu_add_scalar(const int N, const double alpha, double* Y) { + kernel_add_scalar(N, alpha, Y); +} + +template <> +void caffe_gpu_exp(const int N, const float* a, float* y) { + kernel_exp(N, a, y); +} + +template <> +void caffe_gpu_exp(const int N, const double* a, double* y) { + kernel_exp(N, a, y); +} + +template <> +void caffe_gpu_sign(const int N, const float *X, float *Y) { + caffe_gpu_sign_ocl(N, X, Y); +} + + +template <> +void caffe_gpu_sign(const int N, const double *X, double *Y) { + caffe_gpu_sign_ocl(N, X, Y); +} + +template <> +void caffe_gpu_sign(const int N, const float *X, const int offx, float *Y, const int offy) { + caffe_gpu_sign_with_offset_ocl(N, X, offx, Y, offy); +} + + +template <> +void caffe_gpu_sign(const int N, const double *X, const int offx, double *Y, const int offy) { + caffe_gpu_sign_with_offset_ocl(N, X, offx, Y, offy); +} + +template <> +void caffe_gpu_sub(const int N, const float* a, const float* b, + float* y) { + // NOLINT_NEXT_LINE(whitespace/operators) + 
kernel_sub(N, a, b, y); +} + +template <> +void caffe_gpu_sub(const int N, const double* a, const double* b, + double* y) { + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_sub(N, a, b, y); +} + +template <> +void caffe_gpu_mul(const int N, const float* a, const float* b, + float* y) { + kernel_mul(N, a, b, y); +} + +template <> +void caffe_gpu_mul(const int N, const double* a, const double* b, + double* y) { + kernel_mul(N, a, b, y); +} + +template <> +void caffe_gpu_div(const int N, const float* a, const float* b, + float* y) { + kernel_div(N, a, b, y); +} + +template <> +void caffe_gpu_div(const int N, const double* a, const double* b, + double* y) { + kernel_div(N, a, b, y); +} + +template <> +void caffe_gpu_powx(const int N, const float* a, const float alpha, + float* y) { + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_powx(N, a, alpha, y); +} + +template <> +void caffe_gpu_powx(const int N, const double* a, const double alpha, + double* y) { + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_powx(N, a, alpha, y); +} + +void popc_kernel(const int n, const float* a, const float* b, uint8_t* y) { + NOT_IMPLEMENTED; +} + +void popcll_kernel(const int n, const double* a, const double* b, uint8_t* y) { + NOT_IMPLEMENTED; +} + +template <> +uint32_t caffe_gpu_hamming_distance(const int n, const float* x, + const float* y) { + NOT_IMPLEMENTED; +} + +template <> +uint32_t caffe_gpu_hamming_distance(const int n, const double* x, + const double* y) { + NOT_IMPLEMENTED; +} + +void caffe_gpu_rng_uniform(const int n, unsigned int* r) { + caffe_gpu_uniform(n, r); +} + +template <> +void caffe_gpu_rng_uniform(const int n, const float a, const float b, + float* r) { + caffe_gpu_uniform(r, n, a, b); // r is a cl_mem object +} +template <> +void caffe_gpu_rng_uniform(const int n, const double a, const double b, + double* r) { + caffe_gpu_uniform(r, n, a, b); // r is a cl_mem object +} + +template <> +void caffe_gpu_rng_gaussian(const int n, const float mu, + const float sigma, float* r) { + caffe_gpu_gaussian(r, n, mu, sigma); // r is a cl_mem object +} + +template <> +void caffe_gpu_rng_gaussian(const int n, const double mu, + const double sigma, double* r) { + caffe_gpu_gaussian(r, n, mu, sigma); // r is a cl_mem object +} + +template <> +void caffe_gpu_log(const int N, const float* a, float* y) { + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_log(N, a, y); +} + +template <> +void caffe_gpu_log(const int N, const double* a, double* y) { + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_log(N, a, y); +} + +template <> +void caffe_gpu_add(const int N, const float* a, const float* b, + float* y) { + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_add(N, a, b, y); +} + +template <> +void caffe_gpu_add(const int N, const double* a, const double* b, + double* y) { + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_add(N, a, b, y); +} +#endif } // namespace caffe diff --git a/src/caffe/util/math_functions.cpp.protect b/src/caffe/util/math_functions.cpp.protect new file mode 100644 index 00000000..166b709a --- /dev/null +++ b/src/caffe/util/math_functions.cpp.protect @@ -0,0 +1,413 @@ +#include +#include + +#include +#include + +#include "caffe/common.hpp" +#include "caffe/util/math_functions.hpp" +#include "caffe/util/rng.hpp" + + +namespace caffe { + +template<> +void caffe_cpu_gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C) { + int lda = 
(TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, + ldb, beta, C, N); +} + +template<> +void caffe_cpu_gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C) { + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + cblas_dgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, + ldb, beta, C, N); +} + +template <> +void caffe_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, + const int N, const float alpha, const float* A, const float* x, + const float beta, float* y) { + cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1); +} + +template <> +void caffe_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, + const int N, const double alpha, const double* A, const double* x, + const double beta, double* y) { + cblas_dgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1); +} + +template <> +void caffe_axpy(const int N, const float alpha, const float* X, + float* Y) { cblas_saxpy(N, alpha, X, 1, Y, 1); } + +template <> +void caffe_axpy(const int N, const double alpha, const double* X, + double* Y) { cblas_daxpy(N, alpha, X, 1, Y, 1); } + +template +void caffe_set(const int N, const Dtype alpha, Dtype* Y) { + if (alpha == 0) { + memset(Y, 0, sizeof(Dtype) * N); // NOLINT(caffe/alt_fn) + return; + } + for (int i = 0; i < N; ++i) { + Y[i] = alpha; + } +} + +template void caffe_set(const int N, const int alpha, int* Y); +template void caffe_set(const int N, const float alpha, float* Y); +template void caffe_set(const int N, const double alpha, double* Y); + +template <> +void caffe_add_scalar(const int N, const float alpha, float* Y) { + for (int i = 0; i < N; ++i) { + Y[i] += alpha; + } +} + +template <> +void caffe_add_scalar(const int N, const double alpha, double* Y) { + for (int i = 0; i < N; ++i) { + Y[i] += alpha; + } +} + +template +void caffe_copy(const int N, const Dtype* X, Dtype* Y) { + if (X != Y) { + if (Caffe::mode() == Caffe::GPU) { +#ifndef CPU_ONLY + // NOLINT_NEXT_LINE(caffe/alt_fn) + CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyDefault)); +#else + NO_GPU; +#endif + } else { + memcpy(Y, X, sizeof(Dtype) * N); // NOLINT(caffe/alt_fn) + } + } +} + +template void caffe_copy(const int N, const int* X, int* Y); +template void caffe_copy(const int N, const unsigned int* X, + unsigned int* Y); +template void caffe_copy(const int N, const float* X, float* Y); +template void caffe_copy(const int N, const double* X, double* Y); + +template <> +void caffe_scal(const int N, const float alpha, float *X) { + cblas_sscal(N, alpha, X, 1); +} + +template <> +void caffe_scal(const int N, const double alpha, double *X) { + cblas_dscal(N, alpha, X, 1); +} + +template <> +void caffe_cpu_axpby(const int N, const float alpha, const float* X, + const float beta, float* Y) { + cblas_saxpby(N, alpha, X, 1, beta, Y, 1); +} + +template <> +void caffe_cpu_axpby(const int N, const double alpha, const double* X, + const double beta, double* Y) { + cblas_daxpby(N, alpha, X, 1, beta, Y, 1); +} + +template <> +void caffe_add(const int n, const float* a, const float* b, + float* y) { + vsAdd(n, a, b, y); +} + +template <> +void caffe_add(const int n, const double* a, const double* b, + double* y) { + vdAdd(n, a, b, y); +} + +template <> +void caffe_sub(const int n, const float* a, const 
float* b, + float* y) { + vsSub(n, a, b, y); +} + +template <> +void caffe_sub(const int n, const double* a, const double* b, + double* y) { + vdSub(n, a, b, y); +} + +template <> +void caffe_mul(const int n, const float* a, const float* b, + float* y) { + vsMul(n, a, b, y); +} + +template <> +void caffe_mul(const int n, const double* a, const double* b, + double* y) { + vdMul(n, a, b, y); +} + +template <> +void caffe_div(const int n, const float* a, const float* b, + float* y) { + vsDiv(n, a, b, y); +} + +template <> +void caffe_div(const int n, const double* a, const double* b, + double* y) { + vdDiv(n, a, b, y); +} + +template <> +void caffe_powx(const int n, const float* a, const float b, + float* y) { + vsPowx(n, a, b, y); +} + +template <> +void caffe_powx(const int n, const double* a, const double b, + double* y) { + vdPowx(n, a, b, y); +} + +template <> +void caffe_sqr(const int n, const float* a, float* y) { + vsSqr(n, a, y); +} + +template <> +void caffe_sqr(const int n, const double* a, double* y) { + vdSqr(n, a, y); +} + +template <> +void caffe_exp(const int n, const float* a, float* y) { + vsExp(n, a, y); +} + +template <> +void caffe_exp(const int n, const double* a, double* y) { + vdExp(n, a, y); +} + +template <> +void caffe_log(const int n, const float* a, float* y) { + vsLn(n, a, y); +} + +template <> +void caffe_log(const int n, const double* a, double* y) { + vdLn(n, a, y); +} + +template <> +void caffe_abs(const int n, const float* a, float* y) { + vsAbs(n, a, y); +} + +template <> +void caffe_abs(const int n, const double* a, double* y) { + vdAbs(n, a, y); +} + +unsigned int caffe_rng_rand() { + return (*caffe_rng())(); +} + +template +Dtype caffe_nextafter(const Dtype b) { + return boost::math::nextafter( + b, std::numeric_limits::max()); +} + +template +float caffe_nextafter(const float b); + +template +double caffe_nextafter(const double b); + +template +void caffe_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r) { + CHECK_GE(n, 0); + CHECK(r); + CHECK_LE(a, b); + boost::uniform_real random_distribution(a, caffe_nextafter(b)); + boost::variate_generator > + variate_generator(caffe_rng(), random_distribution); + for (int i = 0; i < n; ++i) { + r[i] = variate_generator(); + } +} + +template +void caffe_rng_uniform(const int n, const float a, const float b, + float* r); + +template +void caffe_rng_uniform(const int n, const double a, const double b, + double* r); + +template +void caffe_rng_gaussian(const int n, const Dtype a, + const Dtype sigma, Dtype* r) { + CHECK_GE(n, 0); + CHECK(r); + CHECK_GT(sigma, 0); + boost::normal_distribution random_distribution(a, sigma); + boost::variate_generator > + variate_generator(caffe_rng(), random_distribution); + for (int i = 0; i < n; ++i) { + r[i] = variate_generator(); + } +} + +template +void caffe_rng_gaussian(const int n, const float mu, + const float sigma, float* r); + +template +void caffe_rng_gaussian(const int n, const double mu, + const double sigma, double* r); + +template +void caffe_rng_bernoulli(const int n, const Dtype p, int* r) { + CHECK_GE(n, 0); + CHECK(r); + CHECK_GE(p, 0); + CHECK_LE(p, 1); + boost::bernoulli_distribution random_distribution(p); + boost::variate_generator > + variate_generator(caffe_rng(), random_distribution); + for (int i = 0; i < n; ++i) { + r[i] = variate_generator(); + } +} + +template +void caffe_rng_bernoulli(const int n, const double p, int* r); + +template +void caffe_rng_bernoulli(const int n, const float p, int* r); + +template +void caffe_rng_bernoulli(const 
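
The RNG helpers above wrap a boost distribution in a boost::variate_generator driven by caffe_rng(). Spelled out with its template arguments, the uniform case looks roughly like the self-contained sketch below; boost::mt19937 stands in for Caffe's generator type, which is an assumption made here for illustration only.

    #include <boost/random/mersenne_twister.hpp>
    #include <boost/random/uniform_real.hpp>
    #include <boost/random/variate_generator.hpp>
    #include <boost/math/special_functions/next.hpp>
    #include <limits>

    template <typename Dtype>
    void rng_uniform_sketch(const int n, const Dtype a, const Dtype b, Dtype* r) {
      static boost::mt19937 rng(1701);  // stand-in for *caffe_rng()
      // nextafter(b, max) nudges the upper bound so that b itself can be drawn.
      const Dtype upper =
          boost::math::nextafter(b, std::numeric_limits<Dtype>::max());
      boost::uniform_real<Dtype> dist(a, upper);
      boost::variate_generator<boost::mt19937&, boost::uniform_real<Dtype> >
          gen(rng, dist);
      for (int i = 0; i < n; ++i) {
        r[i] = gen();
      }
    }
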
int n, const Dtype p, unsigned int* r) { + CHECK_GE(n, 0); + CHECK(r); + CHECK_GE(p, 0); + CHECK_LE(p, 1); + boost::bernoulli_distribution random_distribution(p); + boost::variate_generator > + variate_generator(caffe_rng(), random_distribution); + for (int i = 0; i < n; ++i) { + r[i] = static_cast(variate_generator()); + } +} + +template +void caffe_rng_bernoulli(const int n, const double p, unsigned int* r); + +template +void caffe_rng_bernoulli(const int n, const float p, unsigned int* r); + +template <> +float caffe_cpu_strided_dot(const int n, const float* x, const int incx, + const float* y, const int incy) { + return cblas_sdot(n, x, incx, y, incy); +} + +template <> +double caffe_cpu_strided_dot(const int n, const double* x, + const int incx, const double* y, const int incy) { + return cblas_ddot(n, x, incx, y, incy); +} + +template +Dtype caffe_cpu_dot(const int n, const Dtype* x, const Dtype* y) { + return caffe_cpu_strided_dot(n, x, 1, y, 1); +} + +template +float caffe_cpu_dot(const int n, const float* x, const float* y); + +template +double caffe_cpu_dot(const int n, const double* x, const double* y); + +template <> +int caffe_cpu_hamming_distance(const int n, const float* x, + const float* y) { + int dist = 0; + for (int i = 0; i < n; ++i) { + dist += __builtin_popcount(static_cast(x[i]) ^ + static_cast(y[i])); + } + return dist; +} + +template <> +int caffe_cpu_hamming_distance(const int n, const double* x, + const double* y) { + int dist = 0; + for (int i = 0; i < n; ++i) { + dist += __builtin_popcountl(static_cast(x[i]) ^ + static_cast(y[i])); + } + return dist; +} + +template <> +float caffe_cpu_asum(const int n, const float* x) { + return cblas_sasum(n, x, 1); +} + +template <> +double caffe_cpu_asum(const int n, const double* x) { + return cblas_dasum(n, x, 1); +} + +template <> +void caffe_cpu_scale(const int n, const float alpha, const float *x, + float* y) { + cblas_scopy(n, x, 1, y, 1); + cblas_sscal(n, alpha, y, 1); +} + +template <> +void caffe_cpu_scale(const int n, const double alpha, const double *x, + double* y) { + cblas_dcopy(n, x, 1, y, 1); + cblas_dscal(n, alpha, y, 1); +} + +template <> +void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C) { + clblasTranspose transA = (TransA == CblasNoTrans)? clblasNoTrans : clblasTrans; + clblasTranspose transB = (TransB == CblasNoTrans)? clblasNoTrans : clblasTrans; + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + int ldc = N; + //AMDBLAS_CHECK( clAmdBlasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, ldb, (cl_mem)A, lda, (cl_float)beta, (cl_mem)C, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) ); + CLBLAS_CHECK( clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, 0, ldb, (cl_mem)A, 0, lda, (cl_float)beta, (cl_mem)C, 0, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) ); +} + +} // namespace caffe diff --git a/src/caffe/util/math_functions.cu b/src/caffe/util/math_functions.cu index 2631a074..ae71de0f 100644 --- a/src/caffe/util/math_functions.cu +++ b/src/caffe/util/math_functions.cu @@ -24,8 +24,9 @@ void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; cublasOperation_t cuTransB = (TransB == CblasNoTrans) ? 
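
A note on the clblasSgemm call above: it passes a column-major order (amdDevice.col, presumably clblasColumnMajor), hands the operands over in the order B, A, and swaps M and N. That works because a row-major buffer reinterpreted as column-major data is the transpose of the same matrix, so requesting B^T * A^T = (A * B)^T in column-major order writes the row-major product A * B straight into C. A tiny host-side check of that reinterpretation, plain C++ with no OpenCL required:

    #include <cassert>

    int main() {
      // Row-major 2x3 matrix R = [[1, 2, 3], [4, 5, 6]].
      float R[6] = {1, 2, 3, 4, 5, 6};
      // Read as a column-major matrix with 3 rows, the same buffer is R^T (3x2):
      // element (i, j) of that view lives at R[i + 3 * j].
      assert(R[1 + 3 * 0] == 2.0f);  // R^T(1, 0) == R(0, 1)
      assert(R[0 + 3 * 1] == 4.0f);  // R^T(0, 1) == R(1, 0)
      return 0;
    }
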
CUBLAS_OP_N : CUBLAS_OP_T; - CUBLAS_CHECK(cublasSgemm(Caffe::cublas_handle(), cuTransB, cuTransA, - N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); + CUBLAS_CHECK( + cublasSgemm(Caffe::cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, + B, ldb, A, lda, &beta, C, N)); } template <> @@ -40,8 +41,9 @@ void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; cublasOperation_t cuTransB = (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - CUBLAS_CHECK(cublasDgemm(Caffe::cublas_handle(), cuTransB, cuTransA, - N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); + CUBLAS_CHECK( + cublasDgemm(Caffe::cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, + B, ldb, A, lda, &beta, C, N)); } template <> @@ -50,8 +52,9 @@ void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const float beta, float* y) { cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N; - CUBLAS_CHECK(cublasSgemv(Caffe::cublas_handle(), cuTransA, N, M, &alpha, - A, N, x, 1, &beta, y, 1)); + CUBLAS_CHECK( + cublasSgemv(Caffe::cublas_handle(), cuTransA, N, M, &alpha, A, N, x, 1, + &beta, y, 1)); } template <> @@ -60,8 +63,9 @@ void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const double beta, double* y) { cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N; - CUBLAS_CHECK(cublasDgemv(Caffe::cublas_handle(), cuTransA, N, M, &alpha, - A, N, x, 1, &beta, y, 1)); + CUBLAS_CHECK( + cublasDgemv(Caffe::cublas_handle(), cuTransA, N, M, &alpha, A, N, x, 1, + &beta, y, 1)); } template <> @@ -130,14 +134,14 @@ void caffe_gpu_asum(const int n, const double* x, double* y) { template <> void caffe_gpu_scale(const int n, const float alpha, const float *x, - float* y) { + float* y) { CUBLAS_CHECK(cublasScopy(Caffe::cublas_handle(), n, x, 1, y, 1)); CUBLAS_CHECK(cublasSscal(Caffe::cublas_handle(), n, &alpha, y, 1)); } template <> void caffe_gpu_scale(const int n, const double alpha, const double *x, - double* y) { + double* y) { CUBLAS_CHECK(cublasDcopy(Caffe::cublas_handle(), n, x, 1, y, 1)); CUBLAS_CHECK(cublasDscal(Caffe::cublas_handle(), n, &alpha, y, 1)); } @@ -156,8 +160,8 @@ void caffe_gpu_set(const int N, const Dtype alpha, Dtype* Y) { return; } // NOLINT_NEXT_LINE(whitespace/operators) - set_kernel<<>>( - N, alpha, Y); +set_kernel<<>>( + N, alpha, Y); } template void caffe_gpu_set(const int N, const int alpha, int* Y); @@ -166,300 +170,300 @@ template void caffe_gpu_set(const int N, const double alpha, double* Y); template __global__ void add_scalar_kernel(const int n, const Dtype alpha, Dtype* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] += alpha; - } +CUDA_KERNEL_LOOP(index, n) { + y[index] += alpha; +} } template <> void caffe_gpu_add_scalar(const int N, const float alpha, float* Y) { - // NOLINT_NEXT_LINE(whitespace/operators) - add_scalar_kernel<<>>( - N, alpha, Y); +// NOLINT_NEXT_LINE(whitespace/operators) +add_scalar_kernel<<>>( + N, alpha, Y); } template <> void caffe_gpu_add_scalar(const int N, const double alpha, double* Y) { // NOLINT_NEXT_LINE(whitespace/operators) - add_scalar_kernel<<>>( - N, alpha, Y); +add_scalar_kernel<<>>( +N, alpha, Y); } template __global__ void add_kernel(const int n, const Dtype* a, - const Dtype* b, Dtype* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] = a[index] + b[index]; - } +const Dtype* b, Dtype* y) { +CUDA_KERNEL_LOOP(index, n) { +y[index] = a[index] + b[index]; +} } template <> void caffe_gpu_add(const int N, const float* a, const float* b, - float* y) { +float* 
y) { // NOLINT_NEXT_LINE(whitespace/operators) - add_kernel<<>>( - N, a, b, y); +add_kernel<<>>( +N, a, b, y); } template <> void caffe_gpu_add(const int N, const double* a, const double* b, - double* y) { +double* y) { // NOLINT_NEXT_LINE(whitespace/operators) - add_kernel<<>>( - N, a, b, y); +add_kernel<<>>( +N, a, b, y); } template __global__ void sub_kernel(const int n, const Dtype* a, - const Dtype* b, Dtype* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] = a[index] - b[index]; - } +const Dtype* b, Dtype* y) { +CUDA_KERNEL_LOOP(index, n) { +y[index] = a[index] - b[index]; +} } template <> void caffe_gpu_sub(const int N, const float* a, const float* b, - float* y) { +float* y) { // NOLINT_NEXT_LINE(whitespace/operators) - sub_kernel<<>>( - N, a, b, y); +sub_kernel<<>>( +N, a, b, y); } template <> void caffe_gpu_sub(const int N, const double* a, const double* b, - double* y) { +double* y) { // NOLINT_NEXT_LINE(whitespace/operators) - sub_kernel<<>>( - N, a, b, y); +sub_kernel<<>>( +N, a, b, y); } template __global__ void mul_kernel(const int n, const Dtype* a, - const Dtype* b, Dtype* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] = a[index] * b[index]; - } +const Dtype* b, Dtype* y) { +CUDA_KERNEL_LOOP(index, n) { +y[index] = a[index] * b[index]; +} } template <> -void caffe_gpu_mul(const int N, const float* a, - const float* b, float* y) { +void caffe_gpu_mul(const int N, const float* a, const float* b, +float* y) { // NOLINT_NEXT_LINE(whitespace/operators) - mul_kernel<<>>( - N, a, b, y); +mul_kernel<<>>( +N, a, b, y); } template <> -void caffe_gpu_mul(const int N, const double* a, - const double* b, double* y) { +void caffe_gpu_mul(const int N, const double* a, const double* b, +double* y) { // NOLINT_NEXT_LINE(whitespace/operators) - mul_kernel<<>>( - N, a, b, y); +mul_kernel<<>>( +N, a, b, y); } template __global__ void div_kernel(const int n, const Dtype* a, - const Dtype* b, Dtype* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] = a[index] / b[index]; - } +const Dtype* b, Dtype* y) { +CUDA_KERNEL_LOOP(index, n) { +y[index] = a[index] / b[index]; +} } template <> -void caffe_gpu_div(const int N, const float* a, - const float* b, float* y) { +void caffe_gpu_div(const int N, const float* a, const float* b, +float* y) { // NOLINT_NEXT_LINE(whitespace/operators) - div_kernel<<>>( - N, a, b, y); +div_kernel<<>>( +N, a, b, y); } template <> -void caffe_gpu_div(const int N, const double* a, - const double* b, double* y) { +void caffe_gpu_div(const int N, const double* a, const double* b, +double* y) { // NOLINT_NEXT_LINE(whitespace/operators) - div_kernel<<>>( - N, a, b, y); +div_kernel<<>>( +N, a, b, y); } template __global__ void abs_kernel(const int n, const Dtype* a, Dtype* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] = abs(a[index]); - } +CUDA_KERNEL_LOOP(index, n) { +y[index] = abs(a[index]); +} } template <> void caffe_gpu_abs(const int N, const float* a, float* y) { // NOLINT_NEXT_LINE(whitespace/operators) - abs_kernel<<>>( - N, a, y); +abs_kernel<<>>( +N, a, y); } template <> void caffe_gpu_abs(const int N, const double* a, double* y) { // NOLINT_NEXT_LINE(whitespace/operators) - abs_kernel<<>>( - N, a, y); +abs_kernel<<>>( +N, a, y); } - template __global__ void exp_kernel(const int n, const Dtype* a, Dtype* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] = exp(a[index]); - } +CUDA_KERNEL_LOOP(index, n) { +y[index] = exp(a[index]); +} } template <> void caffe_gpu_exp(const int N, const float* a, float* y) { // NOLINT_NEXT_LINE(whitespace/operators) - exp_kernel<<>>( - N, a, 
y); +exp_kernel<<>>( +N, a, y); } template <> void caffe_gpu_exp(const int N, const double* a, double* y) { // NOLINT_NEXT_LINE(whitespace/operators) - exp_kernel<<>>( - N, a, y); +exp_kernel<<>>( +N, a, y); } template __global__ void log_kernel(const int n, const Dtype* a, Dtype* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] = log(a[index]); - } +CUDA_KERNEL_LOOP(index, n) { +y[index] = log(a[index]); +} } template <> void caffe_gpu_log(const int N, const float* a, float* y) { // NOLINT_NEXT_LINE(whitespace/operators) - log_kernel<<>>( - N, a, y); +log_kernel<<>>( +N, a, y); } template <> void caffe_gpu_log(const int N, const double* a, double* y) { // NOLINT_NEXT_LINE(whitespace/operators) - log_kernel<<>>( - N, a, y); +log_kernel<<>>( +N, a, y); } template __global__ void powx_kernel(const int n, const Dtype* a, - const Dtype alpha, Dtype* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] = pow(a[index], alpha); - } +const Dtype alpha, Dtype* y) { +CUDA_KERNEL_LOOP(index, n) { +y[index] = pow(a[index], alpha); +} } template <> -void caffe_gpu_powx(const int N, const float* a, - const float alpha, float* y) { +void caffe_gpu_powx(const int N, const float* a, const float alpha, +float* y) { // NOLINT_NEXT_LINE(whitespace/operators) - powx_kernel<<>>( - N, a, alpha, y); +powx_kernel<<>>( +N, a, alpha, y); } template <> -void caffe_gpu_powx(const int N, const double* a, - const double alpha, double* y) { +void caffe_gpu_powx(const int N, const double* a, const double alpha, +double* y) { // NOLINT_NEXT_LINE(whitespace/operators) - powx_kernel<<>>( - N, a, alpha, y); +powx_kernel<<>>( +N, a, alpha, y); } DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sign, y[index] = (Dtype(0) < x[index]) - - (x[index] < Dtype(0))); +- (x[index] < Dtype(0))); DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sgnbit, y[index] = signbit(x[index])); -__global__ void popc_kernel(const int n, const float* a, - const float* b, uint8_t* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] = __popc(static_cast(a[index]) ^ - static_cast(b[index])); - } +__global__ void popc_kernel(const int n, const float* a, const float* b, +uint8_t* y) { +CUDA_KERNEL_LOOP(index, n) +{ +y[index] = __popc( +static_cast(a[index]) ^ static_cast(b[index])); +} } -__global__ void popcll_kernel(const int n, const double* a, - const double* b, uint8_t* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] = __popcll(static_cast(a[index]) ^ - static_cast(b[index])); - } +__global__ void popcll_kernel(const int n, const double* a, const double* b, +uint8_t* y) { +CUDA_KERNEL_LOOP(index, n) +{ +y[index] = __popcll( +static_cast(a[index]) ^ static_cast(b[index])); +} } template <> uint32_t caffe_gpu_hamming_distance(const int n, const float* x, - const float* y) { +const float* y) { // TODO: Fix caffe_gpu_hamming_distance (see failing unit test // TestHammingDistanceGPU in test_math_functions.cpp). 
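
The sign functor instantiated above through DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC relies on the branchless idiom (0 < x) - (x < 0), which maps x to -1, 0 or +1 without a data-dependent branch. CPU equivalent for reference:

    template <typename Dtype>
    inline int sign_of(const Dtype x) {
      // (0 < x) and (x < 0) evaluate to 0 or 1, so the difference is -1, 0 or +1.
      return (Dtype(0) < x) - (x < Dtype(0));
    }
    // sign_of(-2.5) == -1, sign_of(0.0) == 0, sign_of(7) == 1
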
- NOT_IMPLEMENTED; - thrust::device_vector popcounts(n); +NOT_IMPLEMENTED; +thrust::device_vector < uint8_t > popcounts(n); // NOLINT_NEXT_LINE(whitespace/operators) - popc_kernel<<>>( - n, x, y, thrust::raw_pointer_cast(popcounts.data())); - return thrust::reduce(popcounts.begin(), popcounts.end(), - (uint32_t) 0, thrust::plus()); +popc_kernel<<>>( +n, x, y, thrust::raw_pointer_cast(popcounts.data())); +return thrust::reduce(popcounts.begin(), popcounts.end(), (uint32_t) 0, +thrust::plus()); } template <> uint32_t caffe_gpu_hamming_distance(const int n, const double* x, - const double* y) { +const double* y) { // TODO: Fix caffe_gpu_hamming_distance (see failing unit test // TestHammingDistanceGPU in test_math_functions.cpp). - NOT_IMPLEMENTED; - thrust::device_vector popcounts(n); +NOT_IMPLEMENTED; +thrust::device_vector < uint8_t > popcounts(n); // NOLINT_NEXT_LINE(whitespace/operators) - popcll_kernel<<>>( - n, x, y, thrust::raw_pointer_cast(popcounts.data())); - return thrust::reduce(popcounts.begin(), popcounts.end(), - /* NOLINT_NEXT_LINE(build/include_what_you_use) */ - (uint32_t) 0, thrust::plus()); +popcll_kernel<<>>( +n, x, y, thrust::raw_pointer_cast(popcounts.data())); +return thrust::reduce(popcounts.begin(), popcounts.end(), +/* NOLINT_NEXT_LINE(build/include_what_you_use) */ +(uint32_t) 0, thrust::plus()); } void caffe_gpu_rng_uniform(const int n, unsigned int* r) { - CURAND_CHECK(curandGenerate(Caffe::curand_generator(), r, n)); +CURAND_CHECK(curandGenerate(Caffe::curand_generator(), r, n)); } template <> void caffe_gpu_rng_uniform(const int n, const float a, const float b, - float* r) { - CURAND_CHECK(curandGenerateUniform(Caffe::curand_generator(), r, n)); - const float range = b - a; - if (range != static_cast(1)) { - caffe_gpu_scal(n, range, r); - } - if (a != static_cast(0)) { - caffe_gpu_add_scalar(n, a, r); - } +float* r) { +CURAND_CHECK(curandGenerateUniform(Caffe::curand_generator(), r, n)); +const float range = b - a; +if (range != static_cast(1)) { +caffe_gpu_scal(n, range, r); +} +if (a != static_cast(0)) { +caffe_gpu_add_scalar(n, a, r); +} } template <> void caffe_gpu_rng_uniform(const int n, const double a, const double b, - double* r) { - CURAND_CHECK(curandGenerateUniformDouble(Caffe::curand_generator(), r, n)); - const double range = b - a; - if (range != static_cast(1)) { - caffe_gpu_scal(n, range, r); - } - if (a != static_cast(0)) { - caffe_gpu_add_scalar(n, a, r); - } +double* r) { +CURAND_CHECK(curandGenerateUniformDouble(Caffe::curand_generator(), r, n)); +const double range = b - a; +if (range != static_cast(1)) { +caffe_gpu_scal(n, range, r); +} +if (a != static_cast(0)) { +caffe_gpu_add_scalar(n, a, r); +} } template <> void caffe_gpu_rng_gaussian(const int n, const float mu, const float sigma, - float* r) { - CURAND_CHECK( - curandGenerateNormal(Caffe::curand_generator(), r, n, mu, sigma)); +float* r) { +CURAND_CHECK(curandGenerateNormal(Caffe::curand_generator(), r, n, mu, sigma)); } template <> void caffe_gpu_rng_gaussian(const int n, const double mu, const double sigma, - double* r) { - CURAND_CHECK( - curandGenerateNormalDouble(Caffe::curand_generator(), r, n, mu, sigma)); +double* r) { +CURAND_CHECK( +curandGenerateNormalDouble(Caffe::curand_generator(), r, n, mu, sigma)); } } // namespace caffe diff --git a/src/caffe/util/ocl_util.cpp b/src/caffe/util/ocl_util.cpp new file mode 100644 index 00000000..bc2aea35 --- /dev/null +++ b/src/caffe/util/ocl_util.cpp @@ -0,0 +1,96 @@ 
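
caffe_gpu_rng_uniform above maps curand's (0, 1] output onto the requested interval with one scale by (b - a) and one shift by a. The same affine transform on the host, as a hypothetical helper that is not part of the patch:

    #include <cstddef>

    // Rescale uniform samples u in (0, 1] in place to the interval (a, b].
    void rescale_uniform(const std::size_t n, const float a, const float b,
                         float* r) {
      const float range = b - a;
      for (std::size_t i = 0; i < n; ++i) {
        r[i] = a + range * r[i];  // same effect as gpu_scal(range) + add_scalar(a)
      }
    }
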
+/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + +#include +#include +#include +#include +#include +#include "caffe/common.hpp" +#include "caffe/util/ocl_util.hpp" +namespace caffe { + +#ifndef CPU_ONLY + +template extern std::string get_dtype_suffix(); + +template +void ocl_memset(Dtype* buffer, const Dtype value, const int count, const int buf_offset) { + std::string kernel_name = std::string("oclmem") + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int err = 0; + err = clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*) &buffer); + err |= clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &value); + err |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &count); + err |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &buf_offset); + OCL_CHECK(err); + + size_t Global_Work_Size[1] = { (size_t) count }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + +} + +template void ocl_memset(int* buffer, const int value, const int count, const int buf_offset); +template void ocl_memset(float* buffer, const float value, const int count, const int buf_offset); +template void ocl_memset(double* buffer, const double value, const int count, const int buf_offset); + +void ocl_memset(cl_mem buffer, const int value, + const int count) { + std::string kernel_name = std::string("OCL_memset2"); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int err; + err = clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*) &buffer); + err |= clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &value); + err |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &count); + OCL_CHECK(err); + + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + +} + +void eventCallback(cl_event event, cl_int 
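
ocl_memset above follows the launch pattern used throughout this patch: fetch a prebuilt kernel by name, bind its arguments with clSetKernelArg, then enqueue a 1-D NDRange with a fixed work-group size of 256. OpenCL 1.x requires the global size to be a multiple of an explicitly given local size, so a launcher typically rounds the element count up and lets the kernel guard against the overshoot. A stripped-down sketch of such a helper (hypothetical; the queue and kernel handles are assumed to exist and to have their arguments bound already):

    #include <CL/cl.h>

    // Enqueue a 1-D kernel; the kernel itself is expected to compare its
    // global id against the real element count before writing.
    cl_int launch_1d(cl_command_queue queue, cl_kernel kernel, size_t count) {
      const size_t local = 256;
      const size_t global = ((count + local - 1) / local) * local;
      return clEnqueueNDRangeKernel(queue, kernel, 1, NULL,
                                    &global, &local, 0, NULL, NULL);
    }
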
event_status, void* user_data) { + cl_ulong ev_start_time = (cl_ulong) 0; + cl_ulong ev_end_time = (cl_ulong) 0; + double run_time; + OCL_CHECK( + clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED, + sizeof(cl_ulong), &ev_start_time, NULL)); + OCL_CHECK( + clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), + &ev_end_time, NULL)); + run_time = (double) (ev_end_time - ev_start_time); + printf("The kernel's running time is %f s\n", run_time * 1.0e-9); +} + +#endif +} // namespace caffe diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp new file mode 100644 index 00000000..0b4cbf6f --- /dev/null +++ b/src/caffe/util/ocl_wrapper.cpp @@ -0,0 +1,2017 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ **************************************************************************************/ + +#include +#include +#include +#include +#include +#include "caffe/common.hpp" +#include "caffe/util/ocl_util.hpp" +#include "caffe/util/ocl_wrapper.hpp" +namespace caffe { + +#ifndef CPU_ONLY +typedef unsigned int uint32_t; +struct array4x32 { + uint32_t v[4]; +}; +template +void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup, + Dtype threshold) { + std::string kernel_name = "RNGBernoulli" + get_dtype_suffix(); + cl_kernel ker_rand = amdDevice.GetKernel(kernel_name); + + static unsigned c = 0; + unsigned nrounds = 20; + array4x32 rndctr4; + rndctr4.v[0] = rndctr4.v[1] = rndctr4.v[2] = rndctr4.v[3] = c++; + cl_uint size = n / 4; //Note: for correctness, we need to make sure n is dividable by 4 + + cl_int ret; + ret = clSetKernelArg(ker_rand, 0, sizeof(cl_mem), (void*) &a); + ret |= clSetKernelArg(ker_rand, 1, sizeof(array4x32), (void*) &rndctr4); + ret |= clSetKernelArg(ker_rand, 2, sizeof(Dtype), (void*) &inf); + ret |= clSetKernelArg(ker_rand, 3, sizeof(Dtype), (void*) &sup); + ret |= clSetKernelArg(ker_rand, 4, sizeof(Dtype), (void*) &threshold); + ret |= clSetKernelArg(ker_rand, 5, sizeof(cl_uint), (void*) &nrounds); + ret |= clSetKernelArg(ker_rand, 6, sizeof(cl_uint), (void*) &size); + OCL_CHECK(ret); + + size_t globalws[1] = { size }; + size_t localws[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, + globalws, localws, 0, NULL, NULL)); +} +template void caffe_gpu_bernoulli(int* a, const unsigned int n, + float inf, float sup, float threshold); +template void caffe_gpu_bernoulli(int* a, const unsigned int n, + double inf, double sup, double threshold); + +template +void transform_gpu(Dtype* src, Dtype* dst, const int top_offset, const int N_, + const int M_, const int packing_num) { + std::string kernel_name = "transform" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*) &src); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &dst); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &top_offset); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &N_); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &M_); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &packing_num); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size2[] = { (size_t)(M_ * packing_num) }; + size_t uiLocal_Work_Size2[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size2, uiLocal_Work_Size2, 0, NULL, NULL)); +} + +template void transform_gpu(float* src, float* dst, const int top_offset, + const int N_, const int M_, const int packing_num); +template void transform_gpu(double* src, double* dst, + const int top_offset, const int N_, const int M_, const int packing_num); + +template +void get_max_gpu(cl_kernel Kernel, const int num, const int dim, + const Dtype* bottom_data, Dtype* scale_data) { + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &dim)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &bottom_data)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &scale_data)); + + size_t Global_Work_Size[1] = { (size_t) num }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + 
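
caffe_gpu_bernoulli above drives a counter-based RNG (threefry4x32): the host packs one 32-bit counter value into all four lanes, bumps it on every call, and launches n / 4 work-items so each one produces four samples. As the inline comment notes, n is assumed to be divisible by 4; a host-side guard for that assumption could look like this hypothetical helper:

    #include <stdexcept>

    // Work-item count for a counter-based RNG kernel that emits 4 values each.
    unsigned int rng_work_items(const unsigned int n) {
      if (n % 4 != 0) {
        throw std::invalid_argument("RNG element count must be a multiple of 4");
      }
      return n / 4;
    }
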
Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void get_max_gpu(cl_kernel Kernel, const int num, const int dim, + const float* bottom_data, float* scale_data); +template void get_max_gpu(cl_kernel Kernel, const int num, + const int dim, const double* bottom_data, double* scale_data); + +template +void caffe_gpu_uniform(Dtype* a, const unsigned int n, Dtype inf, Dtype sup, unsigned int seed_) +{ + static unsigned c = 0; + if ((n == 0) || (a == NULL)) { + c = seed_; + return; + } + std::string kernel_name = "RNGUniform" + get_dtype_suffix(); + cl_kernel ker_rand = amdDevice.GetKernel(kernel_name); + + unsigned nrounds = 20; + array4x32 rndctr4; + rndctr4.v[0] = rndctr4.v[1] = rndctr4.v[2] = rndctr4.v[3] = c++; + cl_uint size = n / 4; //Note: for correctness, we need to make sure n is dividable by 4 + + cl_int ret; + ret = clSetKernelArg(ker_rand, 0, sizeof(cl_mem), (void*)&a); + ret |= clSetKernelArg(ker_rand, 1, sizeof(array4x32), (void*)&rndctr4); + ret |= clSetKernelArg(ker_rand, 2, sizeof(Dtype), (void*)&inf); + ret |= clSetKernelArg(ker_rand, 3, sizeof(Dtype), (void*)&sup); + ret |= clSetKernelArg(ker_rand, 4, sizeof(cl_uint), (void*)&nrounds); + ret |= clSetKernelArg(ker_rand, 5, sizeof(cl_uint), (void*)&size); + OCL_CHECK(ret); + + size_t globalws[1] = {size}; + size_t localws[1] = {256}; + OCL_CHECK (clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, globalws, localws, 0, NULL, NULL) ); +} +template void caffe_gpu_uniform(float* a, const unsigned int n, float inf, float sup, unsigned int seed_); +template void caffe_gpu_uniform(double* a, const unsigned int n, double inf, double sup, unsigned int seed_); + +void caffe_gpu_uniform(const unsigned int n, unsigned int *r, unsigned int _seed) +{ + static unsigned c = 0; + if ((n == 0) || (r == NULL)) { + c = _seed; + return; + } + std::string kernel_name = "PRNG_threefry4x32_uint_uniform"; + cl_kernel ker_rand = amdDevice.GetKernel(kernel_name); + + unsigned nrounds = 20; + array4x32 rndctr4; + rndctr4.v[0] = rndctr4.v[1] = rndctr4.v[2] = rndctr4.v[3] = c++; + cl_uint size = n / 4; //Note: for correctness, we need to make sure n is dividable by 4 + + cl_uint inf = 0; + cl_uint sup = UINT_MAX; + cl_int ret; + ret = clSetKernelArg(ker_rand, 0, sizeof(cl_mem), (void*)&r); + ret |= clSetKernelArg(ker_rand, 1, sizeof(array4x32), (void*)&rndctr4); + ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_uint), (void*)&inf); + ret |= clSetKernelArg(ker_rand, 3, sizeof(cl_uint), (void*)&sup); + ret |= clSetKernelArg(ker_rand, 4, sizeof(cl_uint), (void*)&nrounds); + ret |= clSetKernelArg(ker_rand, 5, sizeof(cl_uint), (void*)&size); + OCL_CHECK(ret); + + size_t globalws[1] = {size}; + size_t localws[1] = {256}; + OCL_CHECK (clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, globalws, localws, 0, NULL, NULL) ); +} + +template +void caffe_gpu_gaussian(Dtype* a, const unsigned int n, Dtype E, Dtype V) +{ + std::string kernel_name = "RNGGaussian" + get_dtype_suffix(); + cl_kernel ker_rand = amdDevice.GetKernel(kernel_name); + + static unsigned c = 0; + unsigned nrounds = 20; + array4x32 rndctr4; + rndctr4.v[0] = rndctr4.v[1] = rndctr4.v[2] = rndctr4.v[3] = c++; + cl_uint size = n / 4; //Note: for correctness, we need to make sure n is dividable by 4 + + cl_int ret; + ret = clSetKernelArg(ker_rand, 0, sizeof(cl_mem), (void*)&a); + ret |= clSetKernelArg(ker_rand, 1, sizeof(array4x32), (void*)&rndctr4); + ret |= clSetKernelArg(ker_rand, 2, sizeof(Dtype), (void*)&E); + ret |= clSetKernelArg(ker_rand, 3, 
sizeof(Dtype), (void*)&V); + ret |= clSetKernelArg(ker_rand, 4, sizeof(cl_uint), (void*)&nrounds); + ret |= clSetKernelArg(ker_rand, 5, sizeof(cl_uint), (void*)&size); + OCL_CHECK(ret); + + size_t globalws[1] = {size}; + size_t localws[1] = {256}; + OCL_CHECK (clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, globalws, localws, 0, NULL, NULL) ); +} +template void caffe_gpu_gaussian(float* a, const unsigned int n, float E, float V); +template void caffe_gpu_gaussian(double* a, const unsigned int n, double E, double V); + +template +void exp_gpu(cl_kernel Kernel, const int num, const Dtype* data, Dtype* out) { + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &out)); + + size_t Global_Work_Size[1] = { (size_t) num }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void exp_gpu(cl_kernel Kernel, const int num, const float* data, + float* out); +template void exp_gpu(cl_kernel Kernel, const int num, + const double* data, double* out); + +template +void softmax_div_gpu(cl_kernel Kernel, const int num, const int dim, + const Dtype* scale, Dtype* data) { + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &dim)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &scale)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &data)); + + size_t Global_Work_Size[1] = { (size_t)(num * dim) }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void softmax_div_gpu(cl_kernel Kernel, const int num, + const int dim, const float* scale, float* data); +template void softmax_div_gpu(cl_kernel Kernel, const int num, + const int dim, const double* scale, double* data); + +template +Dtype softmax_gpu(cl_kernel Kernel, const int num, const int dim, + const Dtype* prob_data, const Dtype* label, cl_mem d_loss) { + + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*) &prob_data)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &d_loss)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &label)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &dim)); + OCL_CHECK(clSetKernelArg(Kernel, 5, 256 * sizeof(Dtype), NULL)); + + size_t globalws[1] = { 256 }; + size_t localws[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, globalws, + localws, 0, NULL, NULL)); + void* h_loss = clEnqueueMapBuffer(amdDevice.CommandQueue, d_loss, CL_TRUE, + CL_MAP_READ, 0, sizeof(Dtype), 0, NULL, NULL, NULL); + Dtype loss = *(Dtype*) h_loss; + clEnqueueUnmapMemObject(amdDevice.CommandQueue, d_loss, h_loss, 0, NULL, + NULL); + + return loss; +} + +template float softmax_gpu(cl_kernel Kernel, const int num, + const int dim, const float* prob_data, const float* label, cl_mem d_loss); +template double softmax_gpu(cl_kernel Kernel, const int num, + const int dim, const double* prob_data, const double* label, cl_mem d_loss); + +template +void kernel_channel_max(const int num, const int channels, + const int spatial_dim, const Dtype* data, Dtype* out) 
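
softmax_gpu above reads the reduced loss back by mapping d_loss with clEnqueueMapBuffer and unmapping it again. A blocking clEnqueueReadBuffer of sizeof(Dtype) bytes is the more common equivalent, sketched here for the float case:

    #include <CL/cl.h>

    // Blocking read of a single float result from a device buffer.
    float read_scalar(cl_command_queue queue, cl_mem d_loss) {
      float loss = 0.0f;
      clEnqueueReadBuffer(queue, d_loss, CL_TRUE /* blocking */, 0,
                          sizeof(float), &loss, 0, NULL, NULL);
      return loss;
    }
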
{ + std::string kernel_name = "kernel_channel_max" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &channels)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &spatial_dim)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &data)); + OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &out)); + + size_t Global_Work_Size[1] = { (size_t)(num * spatial_dim) }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void kernel_channel_max(const int num, const int channels, + const int spatial_dim, const float* data, float* out); +template void kernel_channel_max(const int num, const int channels, + const int spatial_dim, const double* data, double* out); + +template +void kernel_channel_subtract(const int count, const int num, const int channels, + const int spatial_dim, const Dtype* channel_max, Dtype* data) { + std::string kernel_name = "kernel_channel_subtract" + + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &channels)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &spatial_dim)); + OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &channel_max)); + OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &data)); + + size_t Global_Work_Size[1] = { (size_t) count }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void kernel_channel_subtract(const int count, const int num, + const int channels, const int spatial_dim, const float* channel_max, + float* data); +template void kernel_channel_subtract(const int count, const int num, + const int channels, const int spatial_dim, const double* channel_max, + double* data); + +template +void kernel_mul(const int count, const Dtype* a, const Dtype* b, Dtype* out) { + std::string kernel_name = "kernel_mul" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &out)); + + size_t Global_Work_Size[1] = { (size_t) count }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void kernel_mul(const int count, const float* a, const float* b, + float* out); +template void kernel_mul(const int count, const double* a, + const double* b, double* out); + +template +void kernel_add_scalar(const int count, const Dtype data, Dtype* out) { + std::string kernel_name = "kernel_add_scalar" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &data)); + 
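
kernel_channel_max and kernel_channel_subtract above implement the numerically stable softmax preamble on a blob laid out as (num, channels, spatial_dim): one work-item per (n, s) pair scans the channel axis. A CPU reference for the same indexing, assuming the standard Caffe layout data[(n * channels + c) * spatial_dim + s]:

    #include <algorithm>

    // For each (n, s), subtract the per-position maximum over channels.
    void channel_max_subtract(const int num, const int channels,
                              const int spatial_dim, float* data) {
      for (int n = 0; n < num; ++n) {
        for (int s = 0; s < spatial_dim; ++s) {
          float maxval = data[(n * channels + 0) * spatial_dim + s];
          for (int c = 1; c < channels; ++c) {
            maxval = std::max(maxval, data[(n * channels + c) * spatial_dim + s]);
          }
          for (int c = 0; c < channels; ++c) {
            data[(n * channels + c) * spatial_dim + s] -= maxval;
          }
        }
      }
    }
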
OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &out)); + + size_t Global_Work_Size[1] = { (size_t) count }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void kernel_add_scalar(const int count, const float data, + float* out); +template void kernel_add_scalar(const int count, const double data, + double* out); + +template +void kernel_powx(const int count, const Dtype* data, const Dtype alpha, + Dtype* out) { + std::string kernel_name = "kernel_powx" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(Dtype), (void*) &alpha)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &out)); + + size_t Global_Work_Size[1] = { (size_t) count }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void kernel_powx(const int count, const float* data, + const float alpha, float* out); +template void kernel_powx(const int count, const double* data, + const double alpha, double* out); + +template +void kernel_div(const int count, const Dtype* a, const Dtype* b, Dtype* out) { + std::string kernel_name = "kernel_div" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &out)); + + size_t Global_Work_Size[1] = { (size_t) count }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void kernel_div(const int count, const float* a, const float* b, + float* out); +template void kernel_div(const int count, const double* a, + const double* b, double* out); + +template +void kernel_add(const int count, const Dtype* a, const Dtype* b, Dtype* out) { + std::string kernel_name = "kernel_add" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &out)); + + size_t Global_Work_Size[1] = { (size_t) count }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void kernel_add(const int count, const float* a, const float* b, + float* out); +template void kernel_add(const int count, const double* a, + const double* b, double* out); + +template +void kernel_sub(const int count, const Dtype* a, const Dtype* b, Dtype* out) { + std::string kernel_name = "kernel_sub" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) 
&a)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &out)); + + size_t Global_Work_Size[1] = { (size_t) count }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void kernel_sub(const int count, const float* a, const float* b, + float* out); +template void kernel_sub(const int count, const double* a, + const double* b, double* out); + +template +void kernel_log(const int count, const Dtype* data, Dtype* out) { + std::string kernel_name = "kernel_log" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &out)); + + size_t Global_Work_Size[1] = { (size_t) count }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void kernel_log(const int count, const float* data, float* out); +template void kernel_log(const int count, const double* data, + double* out); + +template +void kernel_exp(const int count, const Dtype* data, Dtype* out) { + std::string kernel_name = "kernel_exp" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &out)); + + size_t Global_Work_Size[1] = { (size_t) count }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void kernel_exp(const int count, const float* data, float* out); +template void kernel_exp(const int count, const double* data, + double* out); + +template +void kernel_channel_sum(const int num, const int channels, + const int spatial_dim, const Dtype* data, Dtype* channel_sum) { + std::string kernel_name = "kernel_channel_sum" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &channels)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &spatial_dim)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &data)); + OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &channel_sum)); + + size_t Global_Work_Size[1] = { (size_t)(num * spatial_dim) }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void kernel_channel_sum(const int num, const int channels, + const int spatial_dim, const float* data, float* channel_sum); +template void kernel_channel_sum(const int num, const int channels, + const int spatial_dim, const double* data, double* channel_sum); + +template +void kernel_channel_div(const int count, const int num, const int channels, + const int spatial_dim, const Dtype* channel_sum, Dtype* data) { + std::string kernel_name = "kernel_channel_div" + get_dtype_suffix(); + cl_kernel Kernel = 
amdDevice.GetKernel(kernel_name); + + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &channels)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &spatial_dim)); + OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &channel_sum)); + OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &data)); + + size_t Global_Work_Size[1] = { (size_t) count }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void kernel_channel_div(const int count, const int num, + const int channels, const int spatial_dim, const float* channel_sum, + float* data); +template void kernel_channel_div(const int count, const int num, + const int channels, const int spatial_dim, const double* channel_sum, + double* data); + +template +void kernel_channel_dot(const int num, const int channels, + const int spatial_dim, const Dtype* data_1, const Dtype* data_2, + Dtype* channel_dot) { + std::string kernel_name = "kernel_channel_dot" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &channels)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &spatial_dim)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &data_1)); + OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &data_2)); + OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &channel_dot)); + + size_t Global_Work_Size[1] = { (size_t)(num * spatial_dim) }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void kernel_channel_dot(const int num, const int channels, + const int spatial_dim, const float* data_1, const float* data_2, + float* channel_dot); +template void kernel_channel_dot(const int num, const int channels, + const int spatial_dim, const double* data_1, const double* data_2, + double* channel_dot); + +template +void SoftmaxLossForwardGPU(const int nthreads, const Dtype* prob_data, + const Dtype* label, Dtype* loss, const int num, const int dim, + const int spatial_dim, const bool has_ignore_label_, + const int ignore_label_, Dtype* counts) { + std::string kernel_name = "SoftmaxLossForwardGPU" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + int int_has_ignore_label = has_ignore_label_ ? 
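
kernel_channel_dot above computes, for every (n, s) position, the dot product of two blobs along the channel axis; in the softmax backward pass this is the dot of top_diff and top_data that gets subtracted before the elementwise multiply. A CPU reference of that use for a single channel column, assuming the usual softmax gradient bottom_diff_c = (top_diff_c - sum_k top_diff_k * top_data_k) * top_data_c:

    // Softmax backward for one channel column (one fixed n and s).
    void softmax_backward_column(const int channels, const float* top_data,
                                 const float* top_diff, float* bottom_diff) {
      float dot = 0.0f;
      for (int c = 0; c < channels; ++c) {
        dot += top_diff[c] * top_data[c];   // what kernel_channel_dot produces
      }
      for (int c = 0; c < channels; ++c) {  // subtract, then scale by the output
        bottom_diff[c] = (top_diff[c] - dot) * top_data[c];
      }
    }
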
1 : 0; + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &prob_data)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &label)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &loss)); + OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &dim)); + OCL_CHECK(clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &spatial_dim)); + OCL_CHECK( + clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &int_has_ignore_label)); + OCL_CHECK(clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &ignore_label_)); + OCL_CHECK(clSetKernelArg(Kernel, 9, sizeof(cl_mem), (void*) &counts)); + + size_t Global_Work_Size[1] = { (size_t) nthreads }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void SoftmaxLossForwardGPU(const int nthreads, + const float* prob_data, const float* label, float* loss, const int num, + const int dim, const int spatial_dim, const bool has_ignore_label_, + const int ignore_label_, float* counts); +template void SoftmaxLossForwardGPU(const int nthreads, + const double* prob_data, const double* label, double* loss, const int num, + const int dim, const int spatial_dim, const bool has_ignore_label_, + const int ignore_label_, double* counts); + +template +void SoftmaxLossBackwardGPU(const int nthreads, const Dtype* top, + const Dtype* label, Dtype* bottom_diff, const int num, const int dim, + const int spatial_dim, const bool has_ignore_label_, + const int ignore_label_, Dtype* counts) { + std::string kernel_name = "SoftmaxLossBackwardGPU" + + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + int int_has_ignore_label = has_ignore_label_ ? 
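
SoftmaxLossForwardGPU above evaluates -log(prob[label]) per sample, skips samples whose label equals ignore_label_, and records in counts how many samples actually contributed so the loss can be normalized afterwards. A scalar CPU reference of the per-sample step; clamping the probability at FLT_MIN mirrors the CUDA kernel and is an assumption here:

    #include <algorithm>
    #include <cfloat>
    #include <cmath>

    // prob points at the per-class probabilities of one sample position.
    float sample_loss(const float* prob, const int label,
                      const bool has_ignore, const int ignore_label, int* count) {
      if (has_ignore && label == ignore_label) {
        return 0.0f;                     // contributes nothing and is not counted
      }
      ++(*count);
      return -std::log(std::max(prob[label], FLT_MIN));
    }
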
1 : 0; + + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &label)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_diff)); + OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &dim)); + OCL_CHECK(clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &spatial_dim)); + OCL_CHECK( + clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &int_has_ignore_label)); + OCL_CHECK(clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &ignore_label_)); + OCL_CHECK(clSetKernelArg(Kernel, 9, sizeof(cl_mem), (void*) &counts)); + + size_t Global_Work_Size[1] = { (size_t) nthreads }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void SoftmaxLossBackwardGPU(const int nthreads, + const float* top, const float* label, float* bottom_diff, const int num, + const int dim, const int spatial_dim, const bool has_ignore_label_, + const int ignore_label_, float* counts); +template void SoftmaxLossBackwardGPU(const int nthreads, + const double* top, const double* label, double* bottom_diff, const int num, + const int dim, const int spatial_dim, const bool has_ignore_label_, + const int ignore_label_, double* counts); + +template +void scal_gpu(cl_kernel Kernel, const int num, const Dtype alpha, Dtype* data) { + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &alpha)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &data)); + + size_t Global_Work_Size[1] = { (size_t) num }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void scal_gpu(cl_kernel Kernel, const int num, + const float alpha, float* data); +template void scal_gpu(cl_kernel Kernel, const int num, + const double alpha, double* data); + +template +void diff_gpu(cl_kernel Kernel, const int num, int dim, Dtype* data, + const Dtype* label) { + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &dim)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &data)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &label)); + + size_t Global_Work_Size[1] = { (size_t) num }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void diff_gpu(cl_kernel Kernel, const int num, const int dim, + float* data, const float* label); +template void diff_gpu(cl_kernel Kernel, const int num, const int dim, + double* data, const double* label); + +template +void max_pool_fp_gpu(cl_kernel Kernel, const int count, + const Dtype* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + Dtype* top_data) { + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) 
&clnum); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_size_); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &stride_); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void max_pool_fp_gpu(cl_kernel Kernel, const int count, + const float* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + float* top_data); +template void max_pool_fp_gpu(cl_kernel Kernel, const int count, + const double* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + double* top_data); + +template +void MaxPoolForward(const int count, const Dtype* bottom_data, const int clnum, + const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + const int pad_h_, const int pad_w_, Dtype* top_data, int* mask, + Dtype* top_mask) { + std::string kernel_name = "MaxPoolForward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_h_); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_w_); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h_); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w_); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &pad_h_); + ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &pad_w_); + ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*) &top_data); + ret |= clSetKernelArg(Kernel, 15, sizeof(cl_mem), (void*) &mask); + ret |= clSetKernelArg(Kernel, 16, sizeof(cl_mem), (void*) &top_mask); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void MaxPoolForward(const int count, const float* bottom_data, + const int clnum, const int channels_, const int height_, const int width_, + const int 
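
MaxPoolForward above receives the pooled output extents as arguments; with padding and stride they follow the usual pooled = (input + 2 * pad - kernel) / stride + 1 rule, and every output element reduces an input window clipped to the image. A CPU sketch of that window reduction for one output position (ph, pw) of a single-channel row-major image:

    #include <algorithm>
    #include <cfloat>

    float max_pool_one(const float* image, const int height, const int width,
                       const int ph, const int pw, const int kernel_h,
                       const int kernel_w, const int stride_h, const int stride_w,
                       const int pad_h, const int pad_w) {
      int hstart = ph * stride_h - pad_h;
      int wstart = pw * stride_w - pad_w;
      const int hend = std::min(hstart + kernel_h, height);
      const int wend = std::min(wstart + kernel_w, width);
      hstart = std::max(hstart, 0);
      wstart = std::max(wstart, 0);
      float maxval = -FLT_MAX;
      for (int h = hstart; h < hend; ++h) {
        for (int w = wstart; w < wend; ++w) {
          maxval = std::max(maxval, image[h * width + w]);
        }
      }
      return maxval;
    }
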
pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + const int pad_h_, const int pad_w_, float* top_data, int* mask, + float* top_mask); +template void MaxPoolForward(const int count, const double* bottom_data, + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + const int pad_h_, const int pad_w_, double* top_data, int* mask, + double* top_mask); + +template +void StoPoolForwardTrain(const int count, const Dtype* bottom_data, + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + Dtype* idx_data, Dtype* top_data) { + std::string kernel_name = "StoPoolForwardTrain" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_h_); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_w_); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h_); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w_); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_mem), (void*) &idx_data); + ret |= clSetKernelArg(Kernel, 13, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} +template void StoPoolForwardTrain(const int count, + const float* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_h_, const int kernel_w_, + const int stride_h_, const int stride_w_, float* idx_data, float* top_data); +template void StoPoolForwardTrain(const int count, + const double* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_h_, const int kernel_w_, + const int stride_h_, const int stride_w_, double* idx_data, + double* top_data); + +template +void StoPoolForwardTest(const int count, const Dtype* bottom_data, + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + Dtype* top_data) { + std::string kernel_name = "StoPoolForwardTest" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, 
sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_h_); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_w_); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h_); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w_); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + +} +template void StoPoolForwardTest(const int count, + const float* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_h_, const int kernel_w_, + const int stride_h_, const int stride_w_, float* top_data); +template void StoPoolForwardTest(const int count, + const double* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_h_, const int kernel_w_, + const int stride_h_, const int stride_w_, double* top_data); + +template +void AvePoolForward(const int count, const Dtype* bottom_data, const int clnum, + const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + const int pad_h_, const int pad_w_, Dtype* top_data) { + std::string kernel_name = "AvePoolForward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_h_); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_w_); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h_); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w_); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &pad_h_); + ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &pad_w_); + ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) count }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} +template void AvePoolForward(const 
int count, const float* bottom_data, + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + const int pad_h_, const int pad_w_, float* top_data); +template void AvePoolForward(const int count, const double* bottom_data, + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + const int pad_h_, const int pad_w_, double* top_data); + +template +void ave_pool_fp_gpu(cl_kernel Kernel, const int count, + const Dtype* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + const int pad_, Dtype* top_data) { + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_size_); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &stride_); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &pad_); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) count }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} + +template void ave_pool_fp_gpu(cl_kernel Kernel, const int count, + const float* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + const int pad_, float* top_data); +template void ave_pool_fp_gpu(cl_kernel Kernel, const int count, + const double* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + const int pad_, double* top_data); + +template +void max_pool_bp_gpu(cl_kernel Kernel, const int count, + const Dtype* bottom_data, const Dtype* top_data, const Dtype* top_diff, + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_size_, + const int stride_, Dtype* bottom_diff) { + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &clnum); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels_); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &height_); + ret |= 
clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &width_); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pooled_height_); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &pooled_width_); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &kernel_size_); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) count }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} + +template void max_pool_bp_gpu(cl_kernel Kernel, const int count, + const float* bottom_data, const float* top_data, const float* top_diff, + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_size_, + const int stride_, float* bottom_diff); +template void max_pool_bp_gpu(cl_kernel Kernel, const int count, + const double* bottom_data, const double* top_data, const double* top_diff, + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_size_, + const int stride_, double* bottom_diff); + +template +void MaxPoolBackward(const int nthreads, const Dtype* const top_diff, + const int* const mask, const Dtype* const top_mask, const int num, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, const int pad_h, + const int pad_w, Dtype* const bottom_diff) { + std::string kernel_name = "MaxPoolBackward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &mask); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &top_mask); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &num); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &width); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pooled_height); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &pooled_width); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &kernel_h); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &kernel_w); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &stride_h); + ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &stride_w); + ret |= clSetKernelArg(Kernel, 14, sizeof(cl_int), (void*) &pad_h); + ret |= clSetKernelArg(Kernel, 15, sizeof(cl_int), (void*) &pad_w); + ret |= clSetKernelArg(Kernel, 16, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) nthreads }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} + +template void MaxPoolBackward(const int nthreads, + const float* const top_diff, const int* const mask, + const float* const top_mask, const int num, const int channels, + const int 
height, const int width, const int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, const int pad_h, const int pad_w, + float* const bottom_diff); +template void MaxPoolBackward(const int nthreads, + const double* const top_diff, const int* const mask, + const double* const top_mask, const int num, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, const int pad_h, const int pad_w, + double* const bottom_diff); + +template +void AvePoolBackward(const int nthreads, const Dtype* const top_diff, + const int num, const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, const int pad_h, + const int pad_w, Dtype* const bottom_diff) { + std::string kernel_name = "AvePoolBackward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &num); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_h); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_w); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &pad_h); + ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &pad_w); + ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) nthreads }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} +template void AvePoolBackward(const int nthreads, + const float* const top_diff, const int num, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, const int pad_h, const int pad_w, + float* const bottom_diff); +template void AvePoolBackward(const int nthreads, + const double* const top_diff, const int num, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, const int pad_h, const int pad_w, + double* const bottom_diff); + +template +void StoPoolBackward(const int nthreads, const Dtype* const rand_idx, + const Dtype* const top_diff, const int num, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, Dtype* const bottom_diff) { + std::string kernel_name = "StoPoolBackward" + 
get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &rand_idx); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &num); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &width); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_height); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pooled_width); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_h); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &kernel_w); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_h); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &stride_w); + ret |= clSetKernelArg(Kernel, 13, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) nthreads }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} +template void StoPoolBackward(const int nthreads, + const float* const rand_idx, const float* const top_diff, const int num, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, + float* const bottom_diff); +template void StoPoolBackward(const int nthreads, + const double* const rand_idx, const double* const top_diff, const int num, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, + double* const bottom_diff); + +template +void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_size_, + const int stride_, const int pad_, Dtype* bottom_diff) { + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_size_); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &stride_); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &pad_); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) count }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} + +template void ave_pool_bp_gpu(cl_kernel Kernel, const int count, + const float* 
top_diff, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + const int pad_, float* bottom_diff); +template void ave_pool_bp_gpu(cl_kernel Kernel, const int count, + const double* top_diff, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + const int pad_, double* bottom_diff); + +template <typename Dtype> +void PReLUForward(const int count, const int channels, const int dim, + const Dtype* bottom_data, Dtype* top_data, const Dtype* slope_data, + const int div_factor) { + std::string kernel_name = "PReLUForward" + get_dtype_suffix<Dtype>(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &dim); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &top_data); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &slope_data); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &div_factor); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} +template void PReLUForward(const int count, const int channels, + const int dim, const float* bottom_data, float* top_data, + const float* slope_data, const int div_factor); +template void PReLUForward(const int count, const int channels, + const int dim, const double* bottom_data, double* top_data, + const double* slope_data, const int div_factor); + +template <typename Dtype> +void PReLUBackward(const int count, const int channels, const int dim, + const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, + const Dtype* slope_data, const int div_factor) { + std::string kernel_name = "PReLUBackward" + get_dtype_suffix<Dtype>(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &dim); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &bottom_diff); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_mem), (void*) &slope_data); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &div_factor); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} +template void PReLUBackward(const int count, const int channels, + const int dim, const float* top_diff, const float* bottom_data, + float* bottom_diff, const float* slope_data, const int div_factor); +template void PReLUBackward(const int count, const int channels, + const int dim, const double* top_diff, const double* bottom_data, + double* bottom_diff, const double* slope_data, const int div_factor); + +template <typename Dtype> +void PReLUParamBackward(const int count, const Dtype* top_diff, + const int
offset_out, const Dtype* bottom_data, const int offset_in, + Dtype* bottom_diff) { + std::string kernel_name = "PReLUParamBackward" + get_dtype_suffix<Dtype>(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &offset_out); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &offset_in); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} +template void PReLUParamBackward(const int count, const float* top_diff, + const int offset_out, const float* bottom_data, const int offset_in, + float* bottom_diff); +template void PReLUParamBackward(const int count, + const double* top_diff, const int offset_out, const double* bottom_data, + const int offset_in, double* bottom_diff); + +template <typename Dtype> +void ReLUForward(const int count, const Dtype* bottom_data, Dtype* top_data, + Dtype negative_slope) { + std::string kernel_name = "ReLUForward" + get_dtype_suffix<Dtype>(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data); + ret |= clSetKernelArg(Kernel, 3, sizeof(Dtype), (void*) &negative_slope); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void ReLUForward(const int count, const float* bottom_data, + float* top_data, float negative_slope); +template void ReLUForward(const int count, const double* bottom_data, + double* top_data, double negative_slope); + +template <typename Dtype> +void ReLUBackward(const int count, const Dtype* top_diff, + const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope) { + std::string kernel_name = "ReLUBackward" + get_dtype_suffix<Dtype>(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_diff); + ret |= clSetKernelArg(Kernel, 4, sizeof(Dtype), (void*) &negative_slope); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) count }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} +template void ReLUBackward(const int count, const float* top_diff, + const float* bottom_data, float* bottom_diff, float negative_slope); +template void ReLUBackward(const int count, const double* top_diff, + const double* bottom_data, double* bottom_diff, double negative_slope); + +template <typename Dtype> +void SigmoidForward(const int count, const Dtype* bottom_data, + Dtype* top_data) { + std::string kernel_name = "SigmoidForward" +
get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void SigmoidForward(const int count, const float* bottom_data, + float* top_data); +template void SigmoidForward(const int count, const double* bottom_data, + double* top_data); + +template +void SigmoidBackward(const int count, const Dtype* top_diff, + const Dtype* top_data, Dtype* bottom_diff) { + std::string kernel_name = "SigmoidBackward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) count }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} +template void SigmoidBackward(const int count, const float* top_diff, + const float* top_data, float* bottom_diff); +template void SigmoidBackward(const int count, const double* top_diff, + const double* top_data, double* bottom_diff); + +template +void ThresholdForward(const int count, const Dtype threshold, + const Dtype* bottom_data, Dtype* top_data) { + std::string kernel_name = "ThresholdForward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &threshold); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void ThresholdForward(const int count, const float threshold, + const float* bottom_data, float* top_data); +template void ThresholdForward(const int count, const double threshold, + const double* bottom_data, double* top_data); + +template +void TanHForward(const int count, const Dtype* bottom_data, Dtype* top_data) { + std::string kernel_name = "TanHForward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void TanHForward(const int count, const float* bottom_data, + float* 
top_data); +template void TanHForward(const int count, const double* bottom_data, + double* top_data); + +template +void TanHBackward(const int count, const Dtype* top_diff, const Dtype* top_data, + Dtype* bottom_diff) { + std::string kernel_name = "TanHBackward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) count }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} +template void TanHBackward(const int count, const float* top_diff, + const float* top_data, float* bottom_diff); +template void TanHBackward(const int count, const double* top_diff, + const double* top_data, double* bottom_diff); + +template +void opttrans(const Dtype* data_im, const int im_offset, const int channels, + const int height, const int width, Dtype* data_opt, const int opt_offset, + const int optnum) { + std::string kernel_name = "opttrans" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + int num_kernels = channels * height * width * optnum; + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_im); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &im_offset); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_mem), (void*) &data_opt); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &opt_offset); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &optnum); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) num_kernels }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} + +template void opttrans(const float* data_im, const int im_offset, + const int channels, const int height, const int width, float* data_opt, + const int opt_offset, const int optnum); +template void opttrans(const double* data_im, const int im_offset, + const int channels, const int height, const int width, double* data_opt, + const int opt_offset, const int optnum); + +template +void LRNFillScale(const int nthreads, const Dtype* const in, const int num, + const int channels, const int height, const int width, const int size, + const Dtype alpha_over_size, const Dtype k, Dtype* const scale) { + std::string kernel_name = "LRNFillScale" + get_dtype_suffix(); + cl_kernel LFSkernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(LFSkernel, 0, sizeof(cl_int), (void*) &nthreads); + ret |= clSetKernelArg(LFSkernel, 1, sizeof(cl_mem), (void*) &in); + ret |= clSetKernelArg(LFSkernel, 2, sizeof(cl_int), (void*) &num); + ret |= clSetKernelArg(LFSkernel, 3, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(LFSkernel, 4, sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(LFSkernel, 5, sizeof(cl_int), (void*) 
&width); + ret |= clSetKernelArg(LFSkernel, 6, sizeof(cl_int), (void*) &size); + ret |= clSetKernelArg(LFSkernel, 7, sizeof(Dtype), (void*) &alpha_over_size); + ret |= clSetKernelArg(LFSkernel, 8, sizeof(Dtype), (void*) &k); + ret |= clSetKernelArg(LFSkernel, 9, sizeof(cl_mem), (void*) &scale); + OCL_CHECK(ret); + size_t uiGlobal_Work_Size[] = { (size_t) nthreads }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, LFSkernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} +template void LRNFillScale(const int nthreads, const float* const in, + const int num, const int channels, const int height, const int width, + const int size, const float alpha_over_size, const float k, + float* const scale); +template void LRNFillScale(const int nthreads, const double* const in, + const int num, const int channels, const int height, const int width, + const int size, const double alpha_over_size, const double k, + double* const scale); + +template +void LRNComputeOutput(int nthreads, const Dtype* in, Dtype* scale, + Dtype negative_beta, Dtype* out) { + std::string kernel_name = "LRNComputeOutput" + get_dtype_suffix(); + cl_kernel LCOkernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(LCOkernel, 0, sizeof(cl_int), (void*) &nthreads); + ret |= clSetKernelArg(LCOkernel, 1, sizeof(cl_mem), (void*) &in); + ret |= clSetKernelArg(LCOkernel, 2, sizeof(cl_mem), (void*) &scale); + ret |= clSetKernelArg(LCOkernel, 3, sizeof(Dtype), (void*) &negative_beta); + ret |= clSetKernelArg(LCOkernel, 4, sizeof(cl_mem), (void*) &out); + OCL_CHECK(ret); + size_t uiGlobal_Work_Size2[] = { (size_t) nthreads }; + size_t uiLocal_Work_Size2[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCOkernel, 1, NULL, + uiGlobal_Work_Size2, uiLocal_Work_Size2, 0, NULL, NULL)); +} +template void LRNComputeOutput(int nthreads, const float* in, + float* scale, float negative_beta, float* out); +template void LRNComputeOutput(int nthreads, const double* in, + double* scale, double negative_beta, double* out); + +template +void LRNComputeDiff(const int nthreads, const Dtype* const bottom_data, + const Dtype* const top_data, const Dtype* const scale, + const Dtype* const top_diff, const int num, const int channels, + const int height, const int width, const int size, + const Dtype negative_beta, const Dtype cache_ratio, + Dtype* const bottom_diff) { + std::string kernel_name = "LRNComputeDiff" + get_dtype_suffix(); + cl_kernel LCDkernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(LCDkernel, 0, sizeof(cl_int), (void*) &nthreads); + ret |= clSetKernelArg(LCDkernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(LCDkernel, 2, sizeof(cl_mem), (void*) &top_data); + ret |= clSetKernelArg(LCDkernel, 3, sizeof(cl_mem), (void*) &scale); + ret |= clSetKernelArg(LCDkernel, 4, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(LCDkernel, 5, sizeof(cl_int), (void*) &num); + ret |= clSetKernelArg(LCDkernel, 6, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(LCDkernel, 7, sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(LCDkernel, 8, sizeof(cl_int), (void*) &width); + ret |= clSetKernelArg(LCDkernel, 9, sizeof(cl_int), (void*) &size); + ret |= clSetKernelArg(LCDkernel, 10, sizeof(Dtype), (void*) &negative_beta); + ret |= clSetKernelArg(LCDkernel, 11, sizeof(Dtype), (void*) &cache_ratio); + ret |= clSetKernelArg(LCDkernel, 12, sizeof(cl_mem), (void*) 
&bottom_diff); + OCL_CHECK(ret); + size_t uiGlobal_Work_Size[] = { (size_t) nthreads }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCDkernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} +template void LRNComputeDiff(const int nthreads, + const float* const bottom_data, const float* const top_data, + const float* const scale, const float* const top_diff, const int num, + const int channels, const int height, const int width, const int size, + const float negative_beta, const float cache_ratio, + float* const bottom_diff); +template void LRNComputeDiff(const int nthreads, + const double* const bottom_data, const double* const top_data, + const double* const scale, const double* const top_diff, const int num, + const int channels, const int height, const int width, const int size, + const double negative_beta, const double cache_ratio, + double* const bottom_diff); + +template +void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y) { + std::string kernel_name = "caffe_gpu_add" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &in1); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &in2); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) n }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void caffe_gpu_add(const int n, const float* in1, + const float* in2, float* y); +template void caffe_gpu_add(const int n, const double* in1, + const double* in2, double* y); + +template +void caffe_gpu_signbit(const int N, const Dtype* X, Dtype * Y) { + std::string kernel_name = "caffe_gpu_sgnbit" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &N); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &X); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &Y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) N }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} +template void caffe_gpu_signbit(const int N, const float* X, float * Y); +template void caffe_gpu_signbit(const int N, const double* X, double * Y); + +template +void caffe_gpu_sign_ocl(const int N, const Dtype* X, Dtype * Y) { + std::string kernel_name = "caffe_gpu_sign" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &N); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &X); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &Y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) N }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void caffe_gpu_sign_ocl(const int N, const float* X, float* Y); +template void caffe_gpu_sign_ocl(const int N, const double* X, + double* Y); + +template +void caffe_gpu_sign_with_offset_ocl(const int N, const Dtype* X, const int 
offx, Dtype * Y, const int offy) { + std::string kernel_name = "caffe_gpu_sign_with_offset" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &N); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &X); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &offx); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &Y); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &offy); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) N }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void caffe_gpu_sign_with_offset_ocl(const int N, const float* X, const int offx, float* Y, const int offy); +template void caffe_gpu_sign_with_offset_ocl(const int N, const double* X, const int offx, double* Y, const int offy); + + +template +void caffe_gpu_abs_ocl(const int N, const Dtype* X, Dtype * Y) { + std::string kernel_name = "caffe_gpu_abs" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &N); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &X); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &Y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) N }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void caffe_gpu_abs_ocl(const int N, const float* X, float* Y); +template void caffe_gpu_abs_ocl(const int N, const double* X, + double* Y); + +template +void caffe_gpu_div(const int n, const Dtype* a, const Dtype* b, Dtype* y) { + std::string kernel_name = "div" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) n }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void caffe_gpu_div(const int n, const float* a, const float* b, + float* y); +template void caffe_gpu_div(const int n, const double* a, + const double* b, double* y); + +template +void caffe_gpu_add_scalar(const int n, const Dtype alpha, Dtype* top_data) { + std::string kernel_name = "add_scalar" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n); + ret |= clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &alpha); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) n }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void caffe_gpu_add_scalar(const int n, const float alpha, + float* top_data); +template void caffe_gpu_add_scalar(const int n, const double alpha, + double* top_data); + +template +void caffe_gpu_mul(const int n, 
const Dtype* a, const Dtype* b, Dtype* y) { + std::string kernel_name = "element_mul" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) n }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void caffe_gpu_mul(const int n, const float* a, const float* b, + float* y); +template void caffe_gpu_mul(const int n, const double* a, + const double* b, double* y); + +template +void caffe_gpu_powx(const int n, const Dtype* a, const Dtype alpha, Dtype* y) { + std::string kernel_name = "powx" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a); + ret |= clSetKernelArg(Kernel, 2, sizeof(Dtype), (void*) &alpha); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) n }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void caffe_gpu_powx(const int n, const float* a, + const float alpha, float* y); +template void caffe_gpu_powx(const int n, const double* a, + const double alpha, double* y); + +template +void DropoutForward(const int count, const Dtype* bottom_data, + const unsigned int* MaskMem, const unsigned int threshold, + const float scale_, Dtype* top_data) { + std::string kernel_name = "DropoutForward" + get_dtype_suffix(); + cl_kernel kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &MaskMem); + ret |= clSetKernelArg(kernel, 3, sizeof(cl_uint), (void*) &threshold); + ret |= clSetKernelArg(kernel, 4, sizeof(cl_float), (void*) &scale_); + ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void DropoutForward(const int count, const float* bottom_data, + const unsigned int* MaskMem, const unsigned int threshold, + const float scale_, float* top_data); +template void DropoutForward(const int count, const double* bottom_data, + const unsigned int* MaskMem, const unsigned int threshold, + const float scale_, double* top_data); + +template +void DropoutBackward(const int count, const Dtype* top_diff, const unsigned int* MaskMem, + const unsigned int threshold_, const float scale_, Dtype* bottom_diff) { + std::string kernel_name = "DropoutBackward" + get_dtype_suffix(); + cl_kernel kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &top_diff); + ret |= 
clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &MaskMem); + ret |= clSetKernelArg(kernel, 3, sizeof(cl_uint), (void*) &threshold_); + ret |= clSetKernelArg(kernel, 4, sizeof(cl_float), (void*) &scale_); + ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} +template void DropoutBackward(const int count, const float* top_diff, + const unsigned int* MaskMem, const unsigned int threshold_, const float scale_, + float* bottom_diff); +template void DropoutBackward(const int count, const double* top_diff, + const unsigned int* MaskMem, const unsigned int threshold_, const float scale_, + double* bottom_diff); + +template +void BNLLForward(const int count, const Dtype* bottom_data, Dtype *top_data) { + std::string kernel_name = "BNLLForward" + get_dtype_suffix(); + cl_kernel kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} +template void BNLLForward(const int count, const float* bottom_data, + float *top_data); +template void BNLLForward(const int count, const double* bottom_data, + double *top_data); + +template +void BNLLBackward(const int count, const Dtype* top_diff, + const Dtype* bottom_data, Dtype *bottom_diff) { + std::string kernel_name = "BNLLBackward" + get_dtype_suffix(); + cl_kernel kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} +template void BNLLBackward(const int count, const float* top_diff, + const float* bottom_data, float *bottom_diff); +template void BNLLBackward(const int count, const double* top_diff, + const double* bottom_data, double *bottom_diff); + +template +void Concat(const int nthreads, const Dtype* in_data, const bool forward, + const int num_concats, const int concat_size, const int top_concat_axis, + const int bottom_concat_axis, const int offset_concat_axis, + Dtype *out_data) { + std::string kernel_name = "Concat" + get_dtype_suffix(); + cl_kernel kernel = amdDevice.GetKernel(kernel_name); + int k_forward = (forward == true) ? 
+      1 : 0;
+  cl_int ret;
+  ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &nthreads);
+  ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &in_data);
+  ret |= clSetKernelArg(kernel, 2, sizeof(cl_int), (void*) &k_forward);
+  ret |= clSetKernelArg(kernel, 3, sizeof(cl_int), (void*) &num_concats);
+  ret |= clSetKernelArg(kernel, 4, sizeof(cl_int), (void*) &concat_size);
+  ret |= clSetKernelArg(kernel, 5, sizeof(cl_int), (void*) &top_concat_axis);
+  ret |= clSetKernelArg(kernel, 6, sizeof(cl_int), (void*) &bottom_concat_axis);
+  ret |= clSetKernelArg(kernel, 7, sizeof(cl_int), (void*) &offset_concat_axis);
+  ret |= clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*) &out_data);
+  OCL_CHECK(ret);
+
+  size_t Global_Work_Size[] = { (size_t) nthreads };
+  size_t Local_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+template void Concat<float>(const int nthreads, const float* in_data,
+    const bool forward, const int num_concats, const int concat_size,
+    const int top_concat_axis, const int bottom_concat_axis,
+    const int offset_concat_axis, float *out_data);
+template void Concat<double>(const int nthreads, const double* in_data,
+    const bool forward, const int num_concats, const int concat_size,
+    const int top_concat_axis, const int bottom_concat_axis,
+    const int offset_concat_axis, double *out_data);
+
+template <typename Dtype>
+void CLLBackward(const int count, const int channels, const Dtype margin,
+    const bool legacy_version, const Dtype alpha, const Dtype* y,
+    const Dtype* diff, const Dtype* dist_sq, Dtype *bottom_diff) {
+  std::string kernel_name = "CLLBackward" + get_dtype_suffix<Dtype>();
+  cl_kernel kernel = amdDevice.GetKernel(kernel_name);
+
+  cl_int ret;
+  ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count);
+  ret |= clSetKernelArg(kernel, 1, sizeof(cl_int), (void*) &channels);
+  ret |= clSetKernelArg(kernel, 2, sizeof(Dtype), (void*) &margin);
+  ret |= clSetKernelArg(kernel, 3, sizeof(cl_bool), (void*) &legacy_version);
+  ret |= clSetKernelArg(kernel, 4, sizeof(Dtype), (void*) &alpha);
+  ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*) &y);
+  ret |= clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*) &diff);
+  ret |= clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*) &dist_sq);
+  ret |= clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*) &bottom_diff);
+  OCL_CHECK(ret);
+
+  size_t Global_Work_Size[] = { (size_t) count };
+  size_t Local_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+template void CLLBackward<float>(const int count, const int channels,
+    const float margin, const bool legacy_version, const float alpha,
+    const float* y, const float* diff, const float* dist_sq,
+    float *bottom_diff);
+template void CLLBackward<double>(const int count, const int channels,
+    const double margin, const bool legacy_version, const double alpha,
+    const double* y, const double* diff, const double* dist_sq,
+    double *bottom_diff);
+
+template <typename Dtype>
+void MaxForward(const int nthreads, const Dtype* bottom_data_a,
+    const Dtype* bottom_data_b, const int blob_idx, Dtype* top_data,
+    int* mask) {
+  std::string kernel_name = "MaxForward" + get_dtype_suffix<Dtype>();
+  cl_kernel kernel = amdDevice.GetKernel(kernel_name);
+
+  cl_int ret;
+  ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &nthreads);
+  ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &bottom_data_a);
+  ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &bottom_data_b);
+  ret |= clSetKernelArg(kernel, 3, sizeof(cl_int), (void*) &blob_idx);
+  ret |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*) &top_data);
+  ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*) &mask);
+  OCL_CHECK(ret);
+
+  size_t Global_Work_Size[] = { (size_t) nthreads };
+  size_t Local_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+template void MaxForward<float>(const int nthreads, const float* bottom_data_a,
+    const float* bottom_data_b, const int blob_idx, float* top_data, int* mask);
+template void MaxForward<double>(const int nthreads,
+    const double* bottom_data_a, const double* bottom_data_b,
+    const int blob_idx, double* top_data, int* mask);
+
+template <typename Dtype>
+void MaxBackward(const int nthreads, const Dtype* top_diff, const int blob_idx,
+    const int* mask, Dtype* bottom_diff) {
+  std::string kernel_name = "MaxBackward" + get_dtype_suffix<Dtype>();
+  cl_kernel kernel = amdDevice.GetKernel(kernel_name);
+
+  cl_int ret;
+  ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &nthreads);
+  ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &top_diff);
+  ret |= clSetKernelArg(kernel, 2, sizeof(cl_int), (void*) &blob_idx);
+  ret |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*) &mask);
+  ret |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*) &bottom_diff);
+  OCL_CHECK(ret);
+
+  size_t Global_Work_Size[] = { (size_t) nthreads };
+  size_t Local_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+template void MaxBackward<float>(const int nthreads, const float* top_diff,
+    const int blob_idx, const int* mask, float* bottom_diff);
+template void MaxBackward<double>(const int nthreads, const double* top_diff,
+    const int blob_idx, const int* mask, double* bottom_diff);
+
+template <typename Dtype>
+void Slice(const int nthreads, const Dtype* in_data,
+    const bool forward, const int num_slices, const int slice_size,
+    const int bottom_slice_axis, const int top_slice_axis,
+    const int offset_slice_axis, Dtype* out_data) {
+  std::string kernel_name = "Slice" + get_dtype_suffix<Dtype>();
+  cl_kernel kernel = amdDevice.GetKernel(kernel_name);
+  int k_forward = (forward == true) ? 1 : 0;
+  cl_int ret;
+  ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &nthreads);
+  ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &in_data);
+  ret |= clSetKernelArg(kernel, 2, sizeof(cl_int), (void*) &k_forward);
+  ret |= clSetKernelArg(kernel, 3, sizeof(cl_int), (void*) &num_slices);
+  ret |= clSetKernelArg(kernel, 4, sizeof(cl_int), (void*) &slice_size);
+  ret |= clSetKernelArg(kernel, 5, sizeof(cl_int), (void*) &bottom_slice_axis);
+  ret |= clSetKernelArg(kernel, 6, sizeof(cl_int), (void*) &top_slice_axis);
+  ret |= clSetKernelArg(kernel, 7, sizeof(cl_int), (void*) &offset_slice_axis);
+  ret |= clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*) &out_data);
+  OCL_CHECK(ret);
+
+  size_t Global_Work_Size[] = { (size_t) nthreads };
+  size_t Local_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+template void Slice<float>(const int nthreads, const float* in_data,
+    const bool forward, const int num_slices, const int slice_size,
+    const int bottom_slice_axis, const int top_slice_axis,
+    const int offset_slice_axis, float* out_data);
+template void Slice<double>(const int nthreads, const double* in_data,
+    const bool forward, const int num_slices, const int slice_size,
+    const int bottom_slice_axis, const int top_slice_axis,
+    const int offset_slice_axis, double* out_data);
+
+template <typename Dtype>
+void ocl_conv(Dtype* bottom_data, Dtype* top_data, Dtype* weights, Dtype* bias,
+    int channel_in, int width, int height, int channel_out, int width_out,
+    int height_out, int kernel_w, int kernel_h, int stride, int pad,
+    int batch_sz) {
+}
+template void ocl_conv<float>(float* bottom_data, float* top_data,
+    float* weights, float* bias, int channel_in, int width, int height,
+    int channel_out, int width_out, int height_out, int kernel_w, int kernel_h,
+    int stride, int pad, int batch_sz);
+template void ocl_conv<double>(double* bottom_data, double* top_data,
+    double* weights, double* bias, int channel_in, int width, int height,
+    int channel_out, int width_out, int height_out, int kernel_w, int kernel_h,
+    int stride, int pad, int batch_sz);
+
+#endif
+
+}  // namespace caffe
+
diff --git a/src/caffe/util/upgrade_proto.cpp b/src/caffe/util/upgrade_proto.cpp
index 38a06026..028dd884 100644
--- a/src/caffe/util/upgrade_proto.cpp
+++ b/src/caffe/util/upgrade_proto.cpp
@@ -30,7 +30,7 @@ bool NetNeedsV1ToV2Upgrade(const NetParameter& net_param) {
 }
 
 bool UpgradeV0Net(const NetParameter& v0_net_param_padding_layers,
-                  NetParameter* net_param) {
+    NetParameter* net_param) {
   // First upgrade padding layers to padded conv layers.
   NetParameter v0_net_param;
   UpgradeV0PaddingLayers(v0_net_param_padding_layers, &v0_net_param);
@@ -42,7 +42,7 @@ bool UpgradeV0Net(const NetParameter& v0_net_param_padding_layers,
   }
   for (int i = 0; i < v0_net_param.layers_size(); ++i) {
     is_fully_compatible &= UpgradeV0LayerParameter(v0_net_param.layers(i),
-                                                   net_param->add_layers());
+        net_param->add_layers());
   }
   for (int i = 0; i < v0_net_param.input_size(); ++i) {
     net_param->add_input(v0_net_param.input(i));
@@ -57,7 +57,7 @@ bool UpgradeV0Net(const NetParameter& v0_net_param_padding_layers,
 }
 
 void UpgradeV0PaddingLayers(const NetParameter& param,
-                            NetParameter* param_upgraded_pad) {
+    NetParameter* param_upgraded_pad) {
   // Copy everything other than the layers from the original param.
   param_upgraded_pad->Clear();
   param_upgraded_pad->CopyFrom(param);
@@ -77,8 +77,8 @@ void UpgradeV0PaddingLayers(const NetParameter& param,
     }
     for (int j = 0; j < layer_connection.bottom_size(); ++j) {
       const string& blob_name = layer_connection.bottom(j);
-      if (blob_name_to_last_top_idx.find(blob_name) ==
-          blob_name_to_last_top_idx.end()) {
+      if (blob_name_to_last_top_idx.find(blob_name)
+          == blob_name_to_last_top_idx.end()) {
        LOG(FATAL) << "Unknown blob input " << blob_name << " to layer " << j;
       }
       const int top_idx = blob_name_to_last_top_idx[blob_name];
@@ -93,7 +93,7 @@ void UpgradeV0PaddingLayers(const NetParameter& param,
       // cases have undefined behavior in Caffe.
       CHECK((layer_param.type() == "conv") || (layer_param.type() == "pool"))
           << "Padding layer input to "
-             "non-convolutional / non-pooling layer type "
+          "non-convolutional / non-pooling layer type "
           << layer_param.type();
       CHECK_EQ(layer_connection.bottom_size(), 1)
           << "Conv Layer takes a single blob as input.";
@@ -102,10 +102,10 @@ void UpgradeV0PaddingLayers(const NetParameter& param,
       CHECK_EQ(source_layer.top_size(), 1)
           << "Padding Layer produces a single blob as output.";
       int layer_index = param_upgraded_pad->layers_size() - 1;
-      param_upgraded_pad->mutable_layers(layer_index)->mutable_layer()
-          ->set_pad(source_layer.layer().pad());
-      param_upgraded_pad->mutable_layers(layer_index)
-          ->set_bottom(j, source_layer.bottom(0));
+      param_upgraded_pad->mutable_layers(layer_index)->mutable_layer()->set_pad(
+          source_layer.layer().pad());
+      param_upgraded_pad->mutable_layers(layer_index)->set_bottom(j,
+          source_layer.bottom(0));
     }
   }
   for (int j = 0; j < layer_connection.top_size(); ++j) {
@@ -116,7 +116,7 @@ void UpgradeV0PaddingLayers(const NetParameter& param,
 }
 
 bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
-                             V1LayerParameter* layer_param) {
+    V1LayerParameter* layer_param) {
   bool is_fully_compatible = true;
   layer_param->Clear();
   for (int i = 0; i < v0_layer_connection.bottom_size(); ++i) {
@@ -169,11 +169,11 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
   }
   if (v0_layer_param.has_weight_filler()) {
     if (type == "conv") {
-      layer_param->mutable_convolution_param()->
-          mutable_weight_filler()->CopyFrom(v0_layer_param.weight_filler());
+      layer_param->mutable_convolution_param()->mutable_weight_filler()->CopyFrom(
+          v0_layer_param.weight_filler());
     } else if (type == "innerproduct") {
-      layer_param->mutable_inner_product_param()->
-          mutable_weight_filler()->CopyFrom(v0_layer_param.weight_filler());
+      layer_param->mutable_inner_product_param()->mutable_weight_filler()->CopyFrom(
+          v0_layer_param.weight_filler());
     } else {
       LOG(ERROR) << "Unknown parameter weight_filler for layer type " << type;
       is_fully_compatible = false;
@@ -181,11 +181,11 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
   }
   if (v0_layer_param.has_bias_filler()) {
     if (type == "conv") {
-      layer_param->mutable_convolution_param()->
-          mutable_bias_filler()->CopyFrom(v0_layer_param.bias_filler());
+      layer_param->mutable_convolution_param()->mutable_bias_filler()->CopyFrom(
+          v0_layer_param.bias_filler());
     } else if (type == "innerproduct") {
-      layer_param->mutable_inner_product_param()->
-          mutable_bias_filler()->CopyFrom(v0_layer_param.bias_filler());
+      layer_param->mutable_inner_product_param()->mutable_bias_filler()->CopyFrom(
+          v0_layer_param.bias_filler());
     } else {
       LOG(ERROR) << "Unknown parameter bias_filler for layer type " << type;
       is_fully_compatible = false;
@@ -322,12 +322,11 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
     }
   }
   if (v0_layer_param.has_scale()) {
-    layer_param->mutable_transform_param()->
-        set_scale(v0_layer_param.scale());
+    layer_param->mutable_transform_param()->set_scale(v0_layer_param.scale());
   }
   if (v0_layer_param.has_meanfile()) {
-    layer_param->mutable_transform_param()->
-        set_mean_file(v0_layer_param.meanfile());
+    layer_param->mutable_transform_param()->set_mean_file(
+        v0_layer_param.meanfile());
   }
   if (v0_layer_param.has_batchsize()) {
     if (type == "data") {
@@ -348,12 +347,12 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
     }
   }
   if (v0_layer_param.has_cropsize()) {
-    layer_param->mutable_transform_param()->
-        set_crop_size(v0_layer_param.cropsize());
+    layer_param->mutable_transform_param()->set_crop_size(
+        v0_layer_param.cropsize());
   }
   if (v0_layer_param.has_mirror()) {
-    layer_param->mutable_transform_param()->
-        set_mirror(v0_layer_param.mirror());
+    layer_param->mutable_transform_param()->set_mirror(
+        v0_layer_param.mirror());
   }
   if (v0_layer_param.has_rand_skip()) {
    if (type == "data") {
@@ -409,7 +408,7 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
           v0_layer_param.det_fg_threshold());
     } else {
       LOG(ERROR) << "Unknown parameter det_fg_threshold for layer type "
-          << type;
+                 << type;
       is_fully_compatible = false;
     }
   }
@@ -419,7 +418,7 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
           v0_layer_param.det_bg_threshold());
     } else {
       LOG(ERROR) << "Unknown parameter det_bg_threshold for layer type "
-          << type;
+                 << type;
       is_fully_compatible = false;
     }
   }
@@ -429,7 +428,7 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
           v0_layer_param.det_fg_fraction());
     } else {
       LOG(ERROR) << "Unknown parameter det_fg_fraction for layer type "
-          << type;
+                 << type;
       is_fully_compatible = false;
     }
   }
@@ -439,7 +438,7 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
           v0_layer_param.det_context_pad());
     } else {
       LOG(ERROR) << "Unknown parameter det_context_pad for layer type "
-          << type;
+                 << type;
      is_fully_compatible = false;
     }
   }
@@ -448,8 +447,7 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
       layer_param->mutable_window_data_param()->set_crop_mode(
           v0_layer_param.det_crop_mode());
     } else {
-      LOG(ERROR) << "Unknown parameter det_crop_mode for layer type "
-          << type;
+      LOG(ERROR) << "Unknown parameter det_crop_mode for layer type " << type;
       is_fully_compatible = false;
     }
   }
@@ -459,7 +457,7 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
           v0_layer_param.hdf5_output_param());
     } else {
       LOG(ERROR) << "Unknown parameter hdf5_output_param for layer type "
-          << type;
+                 << type;
       is_fully_compatible = false;
     }
   }
@@ -526,24 +524,48 @@ bool NetNeedsDataUpgrade(const NetParameter& net_param) {
   for (int i = 0; i < net_param.layers_size(); ++i) {
     if (net_param.layers(i).type() == V1LayerParameter_LayerType_DATA) {
       DataParameter layer_param = net_param.layers(i).data_param();
-      if (layer_param.has_scale()) { return true; }
-      if (layer_param.has_mean_file()) { return true; }
-      if (layer_param.has_crop_size()) { return true; }
-      if (layer_param.has_mirror()) { return true; }
+      if (layer_param.has_scale()) {
+        return true;
+      }
+      if (layer_param.has_mean_file()) {
+        return true;
+      }
+      if (layer_param.has_crop_size()) {
+        return true;
+      }
+      if (layer_param.has_mirror()) {
+        return true;
+      }
     }
     if (net_param.layers(i).type() == V1LayerParameter_LayerType_IMAGE_DATA) {
       ImageDataParameter layer_param = net_param.layers(i).image_data_param();
-      if (layer_param.has_scale()) { return true; }
-      if (layer_param.has_mean_file()) { return true; }
-      if (layer_param.has_crop_size()) { return true; }
-      if (layer_param.has_mirror()) { return true; }
+      if (layer_param.has_scale()) {
+        return true;
+      }
+      if (layer_param.has_mean_file()) {
+        return true;
+      }
+      if (layer_param.has_crop_size()) {
+        return true;
+      }
+      if (layer_param.has_mirror()) {
+        return true;
+      }
     }
     if (net_param.layers(i).type() == V1LayerParameter_LayerType_WINDOW_DATA) {
       WindowDataParameter layer_param = net_param.layers(i).window_data_param();
-      if (layer_param.has_scale()) { return true; }
-      if (layer_param.has_mean_file()) { return true; }
-      if (layer_param.has_crop_size()) { return true; }
-      if (layer_param.has_mirror()) { return true; }
+      if (layer_param.has_scale()) {
+        return true;
+      }
+      if (layer_param.has_mean_file()) {
+        return true;
+      }
+      if (layer_param.has_crop_size()) {
+        return true;
+      }
+      if (layer_param.has_mirror()) {
+        return true;
+      }
     }
   }
   return false;
@@ -589,7 +611,7 @@ bool UpgradeNetAsNeeded(const string& param_file, NetParameter* param) {
     // NetParameter was specified using the old style (V0LayerParameter); try to
     // upgrade it.
     LOG(ERROR) << "Attempting to upgrade input file specified using deprecated "
-        << "V0LayerParameter: " << param_file;
+               << "V0LayerParameter: " << param_file;
     NetParameter original_param(*param);
     if (!UpgradeV0Net(original_param, param)) {
       success = false;
@@ -597,7 +619,7 @@ bool UpgradeNetAsNeeded(const string& param_file, NetParameter* param) {
                  << "V0NetParameter to NetParameter (see above); continuing anyway.";
     } else {
       LOG(INFO) << "Successfully upgraded file specified using deprecated "
-          << "V0LayerParameter";
+                << "V0LayerParameter";
     }
     LOG(ERROR) << "Note that future Caffe releases will not support "
         << "V0NetParameter; use ./build/tools/upgrade_net_proto_text for "
@@ -607,16 +629,16 @@ bool UpgradeNetAsNeeded(const string& param_file, NetParameter* param) {
   // NetParameter uses old style data transformation fields; try to upgrade it.
   if (NetNeedsDataUpgrade(*param)) {
     LOG(ERROR) << "Attempting to upgrade input file specified using deprecated "
-        << "transformation parameters: " << param_file;
+               << "transformation parameters: " << param_file;
     UpgradeNetDataTransformation(param);
     LOG(INFO) << "Successfully upgraded file specified using deprecated "
-        << "data transformation parameters.";
+              << "data transformation parameters.";
     LOG(ERROR) << "Note that future Caffe releases will only support "
-        << "transform_param messages for transformation fields.";
+               << "transform_param messages for transformation fields.";
   }
   if (NetNeedsV1ToV2Upgrade(*param)) {
     LOG(ERROR) << "Attempting to upgrade input file specified using deprecated "
-        << "V1LayerParameter: " << param_file;
+               << "V1LayerParameter: " << param_file;
     NetParameter original_param(*param);
     if (!UpgradeV1Net(original_param, param)) {
       success = false;
@@ -624,7 +646,7 @@ bool UpgradeNetAsNeeded(const string& param_file, NetParameter* param) {
                  << "V1LayerParameter (see above); continuing anyway.";
     } else {
       LOG(INFO) << "Successfully upgraded file specified using deprecated "
-          << "V1LayerParameter";
+                << "V1LayerParameter";
     }
   }
   return success;
@@ -634,7 +656,7 @@ bool UpgradeV1Net(const NetParameter& v1_net_param, NetParameter* net_param) {
   bool is_fully_compatible = true;
   if (v1_net_param.layer_size() > 0) {
     LOG(ERROR) << "Input NetParameter to be upgraded already specifies 'layer' "
-        << "fields; these will be ignored for the upgrade.";
+               << "fields; these will be ignored for the upgrade.";
     is_fully_compatible = false;
   }
   net_param->CopyFrom(v1_net_param);
@@ -642,7 +664,7 @@ bool UpgradeV1Net(const NetParameter& v1_net_param, NetParameter* net_param) {
   net_param->clear_layer();
   for (int i = 0; i < v1_net_param.layers_size(); ++i) {
     if (!UpgradeV1LayerParameter(v1_net_param.layers(i),
-                                 net_param->add_layer())) {
+        net_param->add_layer())) {
       LOG(ERROR) << "Upgrade of input layer " << i << " failed.";
       is_fully_compatible = false;
     }
@@ -651,7 +673,7 @@ bool UpgradeV1Net(const NetParameter& v1_net_param, NetParameter* net_param) {
 }
 
 bool UpgradeV1LayerParameter(const V1LayerParameter& v1_layer_param,
-                             LayerParameter* layer_param) {
+    LayerParameter* layer_param) {
   layer_param->Clear();
   bool is_fully_compatible = true;
   for (int i = 0; i < v1_layer_param.bottom_size(); ++i) {
@@ -676,12 +698,16 @@ bool UpgradeV1LayerParameter(const V1LayerParameter& v1_layer_param,
     layer_param->add_blobs()->CopyFrom(v1_layer_param.blobs(i));
   }
   for (int i = 0; i < v1_layer_param.param_size(); ++i) {
-    while (layer_param->param_size() <= i) { layer_param->add_param(); }
+    while (layer_param->param_size() <= i) {
+      layer_param->add_param();
+    }
     layer_param->mutable_param(i)->set_name(v1_layer_param.param(i));
   }
   ParamSpec_DimCheckMode mode;
   for (int i = 0; i < v1_layer_param.blob_share_mode_size(); ++i) {
-    while (layer_param->param_size() <= i) { layer_param->add_param(); }
+    while (layer_param->param_size() <= i) {
+      layer_param->add_param();
+    }
     switch (v1_layer_param.blob_share_mode(i)) {
     case V1LayerParameter_DimCheckMode_STRICT:
       mode = ParamSpec_DimCheckMode_STRICT;
       break;
@@ -691,17 +717,21 @@ bool UpgradeV1LayerParameter(const V1LayerParameter& v1_layer_param,
       break;
     default:
       LOG(FATAL) << "Unknown blob_share_mode: "
-          << v1_layer_param.blob_share_mode(i);
+                 << v1_layer_param.blob_share_mode(i);
       break;
     }
     layer_param->mutable_param(i)->set_share_mode(mode);
   }
   for (int i = 0; i < v1_layer_param.blobs_lr_size(); ++i) {
-    while (layer_param->param_size() <= i) { layer_param->add_param(); }
+    while (layer_param->param_size() <= i) {
+      layer_param->add_param();
+    }
     layer_param->mutable_param(i)->set_lr_mult(v1_layer_param.blobs_lr(i));
   }
   for (int i = 0; i < v1_layer_param.weight_decay_size(); ++i) {
-    while (layer_param->param_size() <= i) { layer_param->add_param(); }
+    while (layer_param->param_size() <= i) {
+      layer_param->add_param();
+    }
     layer_param->mutable_param(i)->set_decay_mult(
         v1_layer_param.weight_decay(i));
   }
@@ -729,8 +759,7 @@ bool UpgradeV1LayerParameter(const V1LayerParameter& v1_layer_param,
         v1_layer_param.convolution_param());
   }
   if (v1_layer_param.has_data_param()) {
-    layer_param->mutable_data_param()->CopyFrom(
-        v1_layer_param.data_param());
+    layer_param->mutable_data_param()->CopyFrom(v1_layer_param.data_param());
   }
   if (v1_layer_param.has_dropout_param()) {
     layer_param->mutable_dropout_param()->CopyFrom(
@@ -745,8 +774,7 @@ bool UpgradeV1LayerParameter(const V1LayerParameter& v1_layer_param,
         v1_layer_param.eltwise_param());
   }
   if (v1_layer_param.has_exp_param()) {
-    layer_param->mutable_exp_param()->CopyFrom(
-        v1_layer_param.exp_param());
+    layer_param->mutable_exp_param()->CopyFrom(v1_layer_param.exp_param());
   }
   if (v1_layer_param.has_hdf5_data_param()) {
     layer_param->mutable_hdf5_data_param()->CopyFrom(
@@ -773,28 +801,24 @@ bool UpgradeV1LayerParameter(const V1LayerParameter& v1_layer_param,
         v1_layer_param.inner_product_param());
   }
   if (v1_layer_param.has_lrn_param()) {
-    layer_param->mutable_lrn_param()->CopyFrom(
-        v1_layer_param.lrn_param());
+    layer_param->mutable_lrn_param()->CopyFrom(v1_layer_param.lrn_param());
   }
   if (v1_layer_param.has_memory_data_param()) {
     layer_param->mutable_memory_data_param()->CopyFrom(
         v1_layer_param.memory_data_param());
   }
   if (v1_layer_param.has_mvn_param()) {
-    layer_param->mutable_mvn_param()->CopyFrom(
-        v1_layer_param.mvn_param());
+    layer_param->mutable_mvn_param()->CopyFrom(v1_layer_param.mvn_param());
   }
   if (v1_layer_param.has_pooling_param()) {
     layer_param->mutable_pooling_param()->CopyFrom(
         v1_layer_param.pooling_param());
   }
   if (v1_layer_param.has_power_param()) {
-    layer_param->mutable_power_param()->CopyFrom(
-        v1_layer_param.power_param());
+    layer_param->mutable_power_param()->CopyFrom(v1_layer_param.power_param());
   }
   if (v1_layer_param.has_relu_param()) {
-    layer_param->mutable_relu_param()->CopyFrom(
-        v1_layer_param.relu_param());
+    layer_param->mutable_relu_param()->CopyFrom(v1_layer_param.relu_param());
   }
   if (v1_layer_param.has_sigmoid_param()) {
     layer_param->mutable_sigmoid_param()->CopyFrom(
@@ -805,12 +829,10 @@ bool UpgradeV1LayerParameter(const V1LayerParameter& v1_layer_param,
         v1_layer_param.softmax_param());
   }
   if (v1_layer_param.has_slice_param()) {
-    layer_param->mutable_slice_param()->CopyFrom(
-        v1_layer_param.slice_param());
+    layer_param->mutable_slice_param()->CopyFrom(v1_layer_param.slice_param());
   }
   if (v1_layer_param.has_tanh_param()) {
-    layer_param->mutable_tanh_param()->CopyFrom(
-        v1_layer_param.tanh_param());
+    layer_param->mutable_tanh_param()->CopyFrom(v1_layer_param.tanh_param());
   }
   if (v1_layer_param.has_threshold_param()) {
     layer_param->mutable_threshold_param()->CopyFrom(
@@ -825,8 +847,7 @@ bool UpgradeV1LayerParameter(const V1LayerParameter& v1_layer_param,
         v1_layer_param.transform_param());
   }
   if (v1_layer_param.has_loss_param()) {
-    layer_param->mutable_loss_param()->CopyFrom(
-        v1_layer_param.loss_param());
+    layer_param->mutable_loss_param()->CopyFrom(v1_layer_param.loss_param());
   }
   if (v1_layer_param.has_layer()) {
     LOG(ERROR) << "Input NetParameter has V0 layer -- ignoring.";
@@ -924,14 +945,14 @@ const char* UpgradeV1LayerType(const V1LayerParameter_LayerType type) {
 }
 
 void ReadNetParamsFromTextFileOrDie(const string& param_file,
-                                    NetParameter* param) {
+    NetParameter* param) {
   CHECK(ReadProtoFromTextFile(param_file, param))
       << "Failed to parse NetParameter file: " << param_file;
   UpgradeNetAsNeeded(param_file, param);
 }
 
 void ReadNetParamsFromBinaryFileOrDie(const string& param_file,
-                                      NetParameter* param) {
+    NetParameter* param) {
   CHECK(ReadProtoFromBinaryFile(param_file, param))
       << "Failed to parse NetParameter file: " << param_file;
   UpgradeNetAsNeeded(param_file, param);
diff --git a/tools/caffe.cpp b/tools/caffe.cpp
index 0b7523fc..79b8e127 100644
--- a/tools/caffe.cpp
+++ b/tools/caffe.cpp
@@ -7,6 +7,7 @@
 
 #include "boost/algorithm/string.hpp"
 #include "caffe/caffe.hpp"
+#include "caffe/device.hpp"
 
 using caffe::Blob;
 using caffe::Caffe;
@@ -15,7 +16,9 @@ using caffe::Layer;
 using caffe::shared_ptr;
 using caffe::Timer;
 using caffe::vector;
-
+#ifndef CPU_ONLY
+using caffe::amdDevice;
+#endif
 
 DEFINE_int32(gpu, -1,
     "Run in GPU mode on given device ID.");
@@ -117,7 +120,7 @@ int train() {
     LOG(INFO) << "Use CPU.";
     Caffe::set_mode(Caffe::CPU);
   }
-  
+
   LOG(INFO) << "Starting Optimization";
   shared_ptr<caffe::Solver<float> >
     solver(caffe::GetSolver<float>(solver_param));
@@ -246,6 +249,9 @@ int time() {
   std::vector<double> backward_time_per_layer(layers.size(), 0.0);
   double forward_time = 0.0;
   double backward_time = 0.0;
+#ifndef CPU_ONLY
+  clFinish(amdDevice.CommandQueue);
+#endif
   for (int j = 0; j < FLAGS_iterations; ++j) {
     Timer iter_timer;
     iter_timer.Start();
@@ -253,6 +259,9 @@ int time() {
     for (int i = 0; i < layers.size(); ++i) {
       timer.Start();
       layers[i]->Forward(bottom_vecs[i], top_vecs[i]);
+#ifndef CPU_ONLY
+      clFinish(amdDevice.CommandQueue);
+#endif
       forward_time_per_layer[i] += timer.MicroSeconds();
     }
     forward_time += forward_timer.MicroSeconds();
@@ -261,6 +270,9 @@ int time() {
       timer.Start();
       layers[i]->Backward(top_vecs[i], bottom_need_backward[i],
                           bottom_vecs[i]);
+#ifndef CPU_ONLY
+      clFinish(amdDevice.CommandQueue);
+#endif
       backward_time_per_layer[i] += timer.MicroSeconds();
     }
     backward_time += backward_timer.MicroSeconds();
@@ -291,8 +303,9 @@ int time() {
 RegisterBrewFunction(time);
 
 int main(int argc, char** argv) {
+  FLAGS_log_dir = "./log/";
   // Print output to stderr (while still logging).
-  FLAGS_alsologtostderr = 1;
+  FLAGS_alsologtostderr = 0;
   // Usage message.
   gflags::SetUsageMessage("command line brew\n"
       "usage: caffe <command> <args>\n\n"