diff --git a/.gitignore b/.gitignore index 28f2aca8..5a2ad423 100644 --- a/.gitignore +++ b/.gitignore @@ -91,3 +91,7 @@ LOCK LOG* CURRENT MANIFEST-* + +#cmakefiles +src/caffe/test/CMakeFiles +src/caffe/CMakeFiles diff --git a/LICENSE b/LICENSE index d69d16f5..ca91d911 100644 --- a/LICENSE +++ b/LICENSE @@ -42,3 +42,9 @@ CONTRIBUTION AGREEMENT By contributing to the BVLC/caffe repository through pull-request, comment, or otherwise, the contributor releases their content to the license and copyright terms herein. + +AMD license on the OpenCL parts + +AMD holds license for the OpenCL related code, kernels and optimizations. +AMD license is added to the file or part of the file that written by AMD. +For details, please see license declaration for individual file. diff --git a/Makefile b/Makefile index 05b783af..905a19c3 100644 --- a/Makefile +++ b/Makefile @@ -38,13 +38,10 @@ DYNAMIC_NAME := $(LIB_BUILD_DIR)/lib$(PROJECT).so ############################## # CXX_SRCS are the source files excluding the test ones. CXX_SRCS := $(shell find src/$(PROJECT) ! -name "test_*.cpp" -name "*.cpp") -# CU_SRCS are the cuda source files -CU_SRCS := $(shell find src/$(PROJECT) ! -name "test_*.cu" -name "*.cu") # TEST_SRCS are the test source files TEST_MAIN_SRC := src/$(PROJECT)/test/test_caffe_main.cpp TEST_SRCS := $(shell find src/$(PROJECT) -name "test_*.cpp") TEST_SRCS := $(filter-out $(TEST_MAIN_SRC), $(TEST_SRCS)) -TEST_CU_SRCS := $(shell find src/$(PROJECT) -name "test_*.cu") GTEST_SRC := src/gtest/gtest-all.cpp # TOOL_SRCS are the source files for the tool binaries TOOL_SRCS := $(shell find tools -name "*.cpp") @@ -68,7 +65,7 @@ NONGEN_CXX_SRCS := $(shell find \ matlab/+$(PROJECT)/private \ examples \ tools \ - -name "*.cpp" -or -name "*.hpp" -or -name "*.cu" -or -name "*.cuh") + -name "*.cpp" -or -name "*.hpp") LINT_SCRIPT := scripts/cpp_lint.py LINT_OUTPUT_DIR := $(BUILD_DIR)/.lint LINT_EXT := lint.txt @@ -103,22 +100,19 @@ PROTO_GEN_PY := $(foreach file,${PROTO_SRCS:.proto=_pb2.py}, \ # These objects will be linked into the final shared library, so we # exclude the tool, example, and test objects. CXX_OBJS := $(addprefix $(BUILD_DIR)/, ${CXX_SRCS:.cpp=.o}) -CU_OBJS := $(addprefix $(BUILD_DIR)/cuda/, ${CU_SRCS:.cu=.o}) PROTO_OBJS := ${PROTO_GEN_CC:.cc=.o} -OBJS := $(PROTO_OBJS) $(CXX_OBJS) $(CU_OBJS) +OBJS := $(PROTO_OBJS) $(CXX_OBJS) # tool, example, and test objects TOOL_OBJS := $(addprefix $(BUILD_DIR)/, ${TOOL_SRCS:.cpp=.o}) TOOL_BUILD_DIR := $(BUILD_DIR)/tools TEST_CXX_BUILD_DIR := $(BUILD_DIR)/src/$(PROJECT)/test -TEST_CU_BUILD_DIR := $(BUILD_DIR)/cuda/src/$(PROJECT)/test TEST_CXX_OBJS := $(addprefix $(BUILD_DIR)/, ${TEST_SRCS:.cpp=.o}) -TEST_CU_OBJS := $(addprefix $(BUILD_DIR)/cuda/, ${TEST_CU_SRCS:.cu=.o}) -TEST_OBJS := $(TEST_CXX_OBJS) $(TEST_CU_OBJS) +TEST_OBJS := $(TEST_CXX_OBJS) GTEST_OBJ := $(addprefix $(BUILD_DIR)/, ${GTEST_SRC:.cpp=.o}) EXAMPLE_OBJS := $(addprefix $(BUILD_DIR)/, ${EXAMPLE_SRCS:.cpp=.o}) # Output files for automatic dependency generation -DEPS := ${CXX_OBJS:.o=.d} ${CU_OBJS:.o=.d} ${TEST_CXX_OBJS:.o=.d} \ - ${TEST_CU_OBJS:.o=.d} $(BUILD_DIR)/${MAT$(PROJECT)_SO:.$(MAT_SO_EXT)=.d} +DEPS := ${CXX_OBJS:.o=.d} ${TEST_CXX_OBJS:.o=.d} \ + $(BUILD_DIR)/${MAT$(PROJECT)_SO:.$(MAT_SO_EXT)=.d} # tool, example, and test bins TOOL_BINS := ${TOOL_OBJS:.o=.bin} EXAMPLE_BINS := ${EXAMPLE_OBJS:.o=.bin} @@ -126,11 +120,9 @@ EXAMPLE_BINS := ${EXAMPLE_OBJS:.o=.bin} TOOL_BIN_LINKS := ${TOOL_BINS:.bin=} # Put the test binaries in build/test for convenience. 
TEST_BIN_DIR := $(BUILD_DIR)/test -TEST_CU_BINS := $(addsuffix .testbin,$(addprefix $(TEST_BIN_DIR)/, \ - $(foreach obj,$(TEST_CU_OBJS),$(basename $(notdir $(obj)))))) TEST_CXX_BINS := $(addsuffix .testbin,$(addprefix $(TEST_BIN_DIR)/, \ $(foreach obj,$(TEST_CXX_OBJS),$(basename $(notdir $(obj)))))) -TEST_BINS := $(TEST_CXX_BINS) $(TEST_CU_BINS) +TEST_BINS := $(TEST_CXX_BINS) # TEST_ALL_BIN is the test binary that links caffe dynamically. TEST_ALL_BIN := $(TEST_BIN_DIR)/test_all.testbin @@ -139,35 +131,45 @@ TEST_ALL_BIN := $(TEST_BIN_DIR)/test_all.testbin ############################## WARNS_EXT := warnings.txt CXX_WARNS := $(addprefix $(BUILD_DIR)/, ${CXX_SRCS:.cpp=.o.$(WARNS_EXT)}) -CU_WARNS := $(addprefix $(BUILD_DIR)/cuda/, ${CU_SRCS:.cu=.o.$(WARNS_EXT)}) TOOL_WARNS := $(addprefix $(BUILD_DIR)/, ${TOOL_SRCS:.cpp=.o.$(WARNS_EXT)}) EXAMPLE_WARNS := $(addprefix $(BUILD_DIR)/, ${EXAMPLE_SRCS:.cpp=.o.$(WARNS_EXT)}) TEST_WARNS := $(addprefix $(BUILD_DIR)/, ${TEST_SRCS:.cpp=.o.$(WARNS_EXT)}) -TEST_CU_WARNS := $(addprefix $(BUILD_DIR)/cuda/, ${TEST_CU_SRCS:.cu=.o.$(WARNS_EXT)}) ALL_CXX_WARNS := $(CXX_WARNS) $(TOOL_WARNS) $(EXAMPLE_WARNS) $(TEST_WARNS) -ALL_CU_WARNS := $(CU_WARNS) $(TEST_CU_WARNS) -ALL_WARNS := $(ALL_CXX_WARNS) $(ALL_CU_WARNS) +ALL_WARNS := $(ALL_CXX_WARNS) EMPTY_WARN_REPORT := $(BUILD_DIR)/.$(WARNS_EXT) NONEMPTY_WARN_REPORT := $(BUILD_DIR)/$(WARNS_EXT) -############################## -# Derive include and lib directories -############################## -CUDA_INCLUDE_DIR := $(CUDA_DIR)/include +################################# +# OpenCL include and library +################################# +OCL_INCLUDE_DIR := $(OCL_DIR)/include +CLBLAS_INCLUDE_DIR := ${CLBLAS_DIR}/include + +OCL_LIB_DIR := +CLBLAS_LIB_DIR := +# add /lib/x86_64 only if it exists +ifneq ("$(wildcard $(OCL_LIB_DIR)/lib/x86_64)","") + OCL_LIB_DIR += $(OCL_DIR)/lib/x86_64 +endif +OCL_LIB_DIR += $(OCL_DIR)/lib/x86 + +# add /lib/ only if it exists +ifneq ("$(wildcard $(CLBLAS_DIR)/lib)","") + CLBLAS_LIB_DIR += $(CLBLAS_LIB_DIR)/lib +endif -CUDA_LIB_DIR := -# add /lib64 only if it exists -ifneq ("$(wildcard $(CUDA_DIR)/lib64)","") - CUDA_LIB_DIR += $(CUDA_DIR)/lib64 +# add /lib64/ only if it exists +ifneq ("$(wildcard $(CLBLAS_DIR)/lib64)","") + CLBLAS_LIB_DIR += $(CLBLAS_LIB_DIR)/lib64 endif -CUDA_LIB_DIR += $(CUDA_DIR)/lib INCLUDE_DIRS += $(BUILD_INCLUDE_DIR) ./src ./include ifneq ($(CPU_ONLY), 1) - INCLUDE_DIRS += $(CUDA_INCLUDE_DIR) - LIBRARY_DIRS += $(CUDA_LIB_DIR) - LIBRARIES := cudart cublas curand + INCLUDE_DIRS += $(OCL_INCLUDE_DIR) + $(CLBLAS_INCLUDE_DIR) + LIBRARY_DIRS += $(OCL_LIB_DIR) + $(CLBLAS_LIB_DIR) + LIBRARIES += OpenCL clBLAS + endif LIBRARIES += glog gflags protobuf leveldb snappy \ lmdb boost_system hdf5_hl hdf5 m \ @@ -187,7 +189,6 @@ ifneq ($(strip $(DISTRIBUTE_DIR)),distribute) endif ALL_BUILD_DIRS := $(sort $(BUILD_DIR) $(addprefix $(BUILD_DIR)/, $(SRC_DIRS)) \ - $(addprefix $(BUILD_DIR)/cuda/, $(SRC_DIRS)) \ $(LIB_BUILD_DIR) $(TEST_BIN_DIR) $(PY_PROTO_BUILD_DIR) $(LINT_OUTPUT_DIR) \ $(DISTRIBUTE_SUBDIRS) $(PROTO_BUILD_INCLUDE_DIR)) @@ -206,7 +207,7 @@ DOXYGEN_SOURCES := $(shell find \ matlab/ \ examples \ tools \ - -name "*.cpp" -or -name "*.hpp" -or -name "*.cu" -or -name "*.cuh" -or \ + -name "*.cpp" -or -name "*.hpp" -or \ -name "*.py" -or -name "*.m") DOXYGEN_SOURCES += $(DOXYGEN_CONFIG_FILE) @@ -242,13 +243,8 @@ endif ifeq ($(OSX), 1) CXX := /usr/bin/clang++ ifneq ($(CPU_ONLY), 1) - CUDA_VERSION := $(shell $(CUDA_DIR)/bin/nvcc -V | grep -o 'release \d' | grep -o '\d') - ifeq ($(shell 
echo | awk '{exit $(CUDA_VERSION) < 7.0;}'), 1) - CXXFLAGS += -stdlib=libstdc++ - LINKFLAGS += -stdlib=libstdc++ - endif - # clang throws this warning for cuda headers - WARNINGS += -Wno-unneeded-internal-declaration + # todo + ############# endif # gtest needs to use its own tuple to not conflict with clang COMMON_FLAGS += -DGTEST_USE_OWN_TR1_TUPLE=1 @@ -284,12 +280,6 @@ else COMMON_FLAGS += -DNDEBUG -O2 endif -# cuDNN acceleration configuration. -ifeq ($(USE_CUDNN), 1) - LIBRARIES += cudnn - COMMON_FLAGS += -DUSE_CUDNN -endif - # CPU-only configuration ifeq ($(CPU_ONLY), 1) OBJS := $(PROTO_OBJS) $(CXX_OBJS) @@ -374,7 +364,7 @@ PYTHON_LDFLAGS := $(LDFLAGS) $(foreach library,$(PYTHON_LIBRARIES),-l$(library)) # # * Recursive with the exception that symbolic links are never followed, per the # default behavior of 'find'. -SUPERCLEAN_EXTS := .so .a .o .bin .testbin .pb.cc .pb.h _pb2.py .cuo +SUPERCLEAN_EXTS := .so .a .o .bin .testbin .pb.cc .pb.h _pb2.py # Set the sub-targets of the 'everything' target. EVERYTHING_TARGETS := all py$(PROJECT) test warn lint @@ -525,26 +515,12 @@ $(PROTO_BUILD_DIR)/%.pb.o: $(PROTO_BUILD_DIR)/%.pb.cc $(PROTO_GEN_HEADER) \ || (cat $@.$(WARNS_EXT); exit 1) @ cat $@.$(WARNS_EXT) -$(BUILD_DIR)/cuda/%.o: %.cu | $(ALL_BUILD_DIRS) - @ echo NVCC $< - $(Q)$(CUDA_DIR)/bin/nvcc $(NVCCFLAGS) $(CUDA_ARCH) -M $< -o ${@:.o=.d} \ - -odir $(@D) - $(Q)$(CUDA_DIR)/bin/nvcc $(NVCCFLAGS) $(CUDA_ARCH) -c $< -o $@ 2> $@.$(WARNS_EXT) \ - || (cat $@.$(WARNS_EXT); exit 1) - @ cat $@.$(WARNS_EXT) - $(TEST_ALL_BIN): $(TEST_MAIN_SRC) $(TEST_OBJS) $(GTEST_OBJ) \ | $(DYNAMIC_NAME) $(TEST_BIN_DIR) @ echo CXX/LD -o $@ $< $(Q)$(CXX) $(TEST_MAIN_SRC) $(TEST_OBJS) $(GTEST_OBJ) \ -o $@ $(LINKFLAGS) $(LDFLAGS) -l$(PROJECT) -Wl,-rpath,$(ORIGIN)/../lib -$(TEST_CU_BINS): $(TEST_BIN_DIR)/%.testbin: $(TEST_CU_BUILD_DIR)/%.o \ - $(GTEST_OBJ) | $(DYNAMIC_NAME) $(TEST_BIN_DIR) - @ echo LD $< - $(Q)$(CXX) $(TEST_MAIN_SRC) $< $(GTEST_OBJ) \ - -o $@ $(LINKFLAGS) $(LDFLAGS) -l$(PROJECT) -Wl,-rpath,$(ORIGIN)/../lib - $(TEST_CXX_BINS): $(TEST_BIN_DIR)/%.testbin: $(TEST_CXX_BUILD_DIR)/%.o \ $(GTEST_OBJ) | $(DYNAMIC_NAME) $(TEST_BIN_DIR) @ echo LD $< diff --git a/Makefile.config b/Makefile.config new file mode 100644 index 00000000..eea4c1f3 --- /dev/null +++ b/Makefile.config @@ -0,0 +1,100 @@ +## Refer to http://caffe.berkeleyvision.org/installation.html +# Contributions simplifying and improving our build system are welcome! + +# Use OpenCL + USE_OPENCL := 1 +# OpenCL directory + OCL_DIR := /opt/AMDAPPSDK-2.9-1 +# clBLAS directory + CLBLAS_DIR := /opt/clBLAS-2.1 + +# cuDNN acceleration switch (uncomment to build with cuDNN). +# USE_CUDNN := 1 + +# CPU-only switch (uncomment to build without GPU support). +# CPU_ONLY := 1 + +# To customize your choice of compiler, uncomment and set the following. +# N.B. the default for Linux is g++ and the default for OSX is clang++ +# CUSTOM_CXX := g++ + +# CUDA directory contains bin/ and lib/ directories that we need. +#CUDA_DIR := /usr/local/cuda +# On Ubuntu 14.04, if cuda tools are installed via +# "sudo apt-get install nvidia-cuda-toolkit" then use this instead: +# CUDA_DIR := /usr + +# CUDA architecture setting: going with all of them. +# For CUDA < 6.0, comment the *_50 lines for compatibility. 
+#CUDA_ARCH := -gencode arch=compute_20,code=sm_20 \ + -gencode arch=compute_20,code=sm_21 \ + -gencode arch=compute_30,code=sm_30 \ + -gencode arch=compute_35,code=sm_35 \ + -gencode arch=compute_50,code=sm_50 \ + -gencode arch=compute_50,code=compute_50 + +# BLAS choice: +# atlas for ATLAS (default) +# mkl for MKL +# open for OpenBlas +BLAS := atlas +# Custom (MKL/ATLAS/OpenBLAS) include and lib directories. +# Leave commented to accept the defaults for your choice of BLAS +# (which should work)! +# BLAS_INCLUDE := /path/to/your/blas +# BLAS_LIB := /path/to/your/blas + +# Homebrew puts openblas in a directory that is not on the standard search path +# BLAS_INCLUDE := $(shell brew --prefix openblas)/include +# BLAS_LIB := $(shell brew --prefix openblas)/lib + +# This is required only if you will compile the matlab interface. +# MATLAB directory should contain the mex binary in /bin. +# MATLAB_DIR := /usr/local +# MATLAB_DIR := /Applications/MATLAB_R2012b.app + +# NOTE: this is required only if you will compile the python interface. +# We need to be able to find Python.h and numpy/arrayobject.h. +PYTHON_INCLUDE := /usr/include/python2.7 \ + /usr/lib/python2.7/dist-packages/numpy/core/include +# Anaconda Python distribution is quite popular. Include path: +# Verify anaconda location, sometimes it's in root. +# ANACONDA_HOME := $(HOME)/anaconda +# PYTHON_INCLUDE := $(ANACONDA_HOME)/include \ + # $(ANACONDA_HOME)/include/python2.7 \ + # $(ANACONDA_HOME)/lib/python2.7/site-packages/numpy/core/include \ + +# We need to be able to find libpythonX.X.so or .dylib. +PYTHON_LIB := /usr/lib +# PYTHON_LIB := $(ANACONDA_HOME)/lib + +# Homebrew installs numpy in a non standard path (keg only) +# PYTHON_INCLUDE += $(dir $(shell python -c 'import numpy.core; print(numpy.core.__file__)'))/include +# PYTHON_LIB += $(shell brew --prefix numpy)/lib + +# Uncomment to support layers written in Python (will link against Python libs) +# WITH_PYTHON_LAYER := 1 + +# Whatever else you find you need goes here. +INCLUDE_DIRS := $(PYTHON_INCLUDE) /usr/local/include +LIBRARY_DIRS := $(PYTHON_LIB) /usr/local/lib /usr/lib + +# If Homebrew is installed at a non standard location (for example your home directory) and you use it for general dependencies +# INCLUDE_DIRS += $(shell brew --prefix)/include +# LIBRARY_DIRS += $(shell brew --prefix)/lib + +# Uncomment to use `pkg-config` to specify OpenCV library paths. +# (Usually not necessary -- OpenCV libraries are normally installed in one of the above $LIBRARY_DIRS.) +# USE_PKG_CONFIG := 1 + +BUILD_DIR := build +DISTRIBUTE_DIR := distribute + +# Uncomment for debugging. Does not work on OSX due to https://github.com/BVLC/caffe/issues/171 + DEBUG := 1 + +# The ID of the GPU that 'make runtest' will use to run unit tests. +TEST_GPUID := 0 + +# enable pretty build (comment to see full commands) +Q ?= @ diff --git a/README.md b/README.md index ebec286d..ebc83a1a 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,57 @@ -# Caffe +#OpenCL Caffe + +This is an OpenCL implementation of Caffe, a mainstream DNN framework (https://github.com/BVLC/caffe). It includes a largely complete Caffe feature set as of August 2015. The project is under active development to improve performance and add new features. Contributions from the community are welcome. + +OpenCL (https://en.wikipedia.org/wiki/OpenCL) is an open standard parallel programming language for heterogeneous platforms. OpenCL is supported by a variety of commercial chip manufacturers. 
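
A quick way to confirm that an OpenCL runtime and at least one device are visible before building is to enumerate platforms with the standard OpenCL C API. The sketch below is illustrative only and is not part of this patch; the include and library paths in the build command are assumptions based on the OCL_DIR value shown in Makefile.config above (e.g. /opt/AMDAPPSDK-2.9-1).

```cpp
// cl_query.cpp - minimal sketch: list OpenCL platforms and their device counts.
#include <CL/cl.h>
#include <cstdio>
#include <vector>

int main() {
  // Ask how many platforms the installed OpenCL runtime exposes.
  cl_uint num_platforms = 0;
  if (clGetPlatformIDs(0, NULL, &num_platforms) != CL_SUCCESS || num_platforms == 0) {
    std::fprintf(stderr, "No OpenCL platforms found.\n");
    return 1;
  }
  std::vector<cl_platform_id> platforms(num_platforms);
  clGetPlatformIDs(num_platforms, platforms.data(), NULL);

  for (cl_uint i = 0; i < num_platforms; ++i) {
    char name[256] = {0};
    clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(name), name, NULL);
    // Count all devices (CPU, GPU, APU/accelerator) on this platform.
    cl_uint num_devices = 0;
    clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
    std::printf("Platform %u: %s (%u devices)\n", i, name, num_devices);
  }
  return 0;
}
```

Build and run with something like `g++ cl_query.cpp -I$OCL_DIR/include -L$OCL_DIR/lib/x86_64 -lOpenCL && ./a.out` (the `cl_query.cpp` name and the library subdirectory are assumptions; adjust them to your SDK layout). If this reports no platforms or devices, the OpenCL Caffe build will not be able to use the GPU either.
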
+ +#Design features + -All Caffe layers ported to OpenCL + + -Performance improvement by batched implementation for conv layer based on clBLAS + + -The user can choose the optimal batch number depending on H/W properties, image size and minibatch size + + -Supports OpenCL 2.0, 1.2 + + -Implemented in C++ and OpenCL, maintaining the same interfaces as the original Caffe + + -Users can directly run DNN models: AlexNet, VGG-16 and VGG-19 + +Note: More features are planned in the near future. Currently this implementation has been verified and tuned on AMD devices (CPUs/GPUs/APUs). Compatibility across different chip manufacturers will be considered for future addition. + +#Performance + +We intend to keep updating the latest performance as we make optimizations. Fury results are preliminary and are actively being improved. + +* Training speed (Model: AlexNet, minibatch size 128) + + -AMD W9100, 255 images per second + + -AMD R9 Fury, 261 images per second + +* Recognition speed (Model: AlexNet, minibatch size 128) + + -AMD W9100, 590 images per second + + -AMD R9 Fury, 699 images per second + +#Wiki +For more information on how to install, use or contribute to this code base, please visit our wiki page: + https://github.com/amd/OpenCL-caffe/wiki + +#Contributors +Junli Gu, Yibing Liu, Yuan Gao, Maohua Zhu + +We thank Mauricio Breternitz, Hanjin Chu and Greg Stoner for their technical suggestions and support. + +#Support needed + As an open source project, we hope to maintain an open dynamics and sharing culture. We encourage the contribution and support from the community to improve it together. + +#License +The original Caffe is provided in the [BSD 2-Clause license](https://github.com/BVLC/caffe/blob/master/LICENSE) open source license. The OpenCL ports written by AMD is covered by AMD license. We encourage the contribution and support from external, your contribution will be covered either by BSD 2-Clause license or whichever your preferred license. + +# Original Caffe information +## Caffe Caffe is a deep learning framework made with expression, speed, and modularity in mind. It is developed by the Berkeley Vision and Learning Center ([BVLC](http://bvlc.eecs.berkeley.edu)) and community contributors. diff --git a/cmake/CaffeConfig.cmake b/cmake/CaffeConfig.cmake new file mode 100644 index 00000000..076edc5d --- /dev/null +++ b/cmake/CaffeConfig.cmake @@ -0,0 +1,61 @@ +# Config file for the Caffe package. +# +# Note: +# Caffe and this config file depends on opencv, +# so put `find_package(OpenCV)` before searching Caffe +# via `find_package(Caffe)`. 
All other lib/includes +# dependencies are hard coded in the file +# +# After successful configuration the following variables +# will be defined: +# +# Caffe_INCLUDE_DIRS - Caffe include directories +# Caffe_LIBRARIES - libraries to link against +# Caffe_DEFINITIONS - a list of definitions to pass to compiler +# +# Caffe_HAVE_CUDA - signals about CUDA support +# Caffe_HAVE_CUDNN - signals about cuDNN support + + +# OpenCV dependency + +if(NOT OpenCV_FOUND) + set(Caffe_OpenCV_CONFIG_PATH "/usr/local/share/OpenCV") + if(Caffe_OpenCV_CONFIG_PATH) + get_filename_component(Caffe_OpenCV_CONFIG_PATH ${Caffe_OpenCV_CONFIG_PATH} ABSOLUTE) + + if(EXISTS ${Caffe_OpenCV_CONFIG_PATH} AND NOT TARGET opencv_core) + message(STATUS "Caffe: using OpenCV config from ${Caffe_OpenCV_CONFIG_PATH}") + include(${Caffe_OpenCV_CONFIG_PATH}/OpenCVModules.cmake) + endif() + + else() + find_package(OpenCV REQUIRED) + endif() + unset(Caffe_OpenCV_CONFIG_PATH) +endif() + +# Compute paths +get_filename_component(Caffe_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) +set(Caffe_INCLUDE_DIRS "/usr/local/include;/usr/include;/opt/AMDAPPSDK-2.9-1/include;/opt/clBLAS-2.1/include;/usr/local/include/opencv;/usr/include/atlas") + +get_filename_component(__caffe_include "${Caffe_CMAKE_DIR}/../../include" ABSOLUTE) +list(APPEND Caffe_INCLUDE_DIRS ${__caffe_include}) +unset(__caffe_include) + + +# Our library dependencies +if(NOT TARGET caffe AND NOT caffe_BINARY_DIR) + include("${Caffe_CMAKE_DIR}/CaffeTargets.cmake") +endif() + +# List of IMPORTED libs created by CaffeTargets.cmake +set(Caffe_LIBRARIES caffe) + +# Definitions +set(Caffe_DEFINITIONS "-DCPU_ONLY") + +# Cuda support variables +set(Caffe_CPU_ONLY OFF) +set(Caffe_HAVE_CUDA FALSE) +set(Caffe_HAVE_CUDNN FALSE) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 7c86dd55..eb72e89f 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -44,17 +44,27 @@ include_directories(SYSTEM ${Snappy_INCLUDE_DIR}) list(APPEND Caffe_LINKER_LIBS ${Snappy_LIBRARIES}) # ---[ CUDA -include(cmake/Cuda.cmake) -if(NOT HAVE_CUDA) - if(CPU_ONLY) - message("-- CUDA is disabled. Building without it...") - else() - message("-- CUDA is not detected by cmake. Building without it...") - endif() +#include(cmake/Cuda.cmake) +#if(NOT HAVE_CUDA) +# if(CPU_ONLY) +# message("-- CUDA is disabled. Building without it...") +# else() +# message("-- CUDA is not detected by cmake. Building without it...") +# endif() # TODO: remove this not cross platform define in future. Use caffe_config.h instead. - add_definitions(-DCPU_ONLY) -endif() +# add_definitions(-DCPU_ONLY) +#endif() + +# ---[ OpenCL +find_package(OpenCL REQUIRED) +include_directories(SYSTEM ${OPENCL_INCLUDE_DIRS}) +list(APPEND Caffe_LINKER_LIBS ${OPENCL_LIBRARIES}) + +# ---[ clBLAS +find_package(clBLAS REQUIRED) +include_directories(SYSTEM ${CLBLAS_INCLUDE_DIRS}) +list(APPEND Caffe_LINKER_LIBS ${CLBLAS_LIBRARIES}) # ---[ OpenCV find_package(OpenCV QUIET COMPONENTS core highgui imgproc imgcodecs) diff --git a/cmake/Modules/FindOpenCL.cmake b/cmake/Modules/FindOpenCL.cmake new file mode 100644 index 00000000..93abd4f9 --- /dev/null +++ b/cmake/Modules/FindOpenCL.cmake @@ -0,0 +1,108 @@ +# ######################################################################## +# Copyright 2013 Advanced Micro Devices, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ######################################################################## + + +# Locate an OpenCL implementation. +# Currently supports AMD APP SDK (http://developer.amd.com/sdks/AMDAPPSDK/Pages/default.aspx/) +# +# Defines the following variables: +# +# OPENCL_FOUND - Found the OPENCL framework +# OPENCL_INCLUDE_DIRS - Include directories +# +# Also defines the library variables below as normal +# variables. These contain debug/optimized keywords when +# a debugging library is found. +# +# OPENCL_LIBRARIES - libopencl +# +# Accepts the following variables as input: +# +# OPENCL_ROOT - (as a CMake or environment variable) +# The root directory of the OpenCL implementation found +# +# FIND_LIBRARY_USE_LIB64_PATHS - Global property that controls whether findOpenCL should search for +# 64bit or 32bit libs +#----------------------- +# Example Usage: +# +# find_package(OPENCL REQUIRED) +# include_directories(${OPENCL_INCLUDE_DIRS}) +# +# add_executable(foo foo.cc) +# target_link_libraries(foo ${OPENCL_LIBRARIES}) +# +#----------------------- + +set_property(GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS ON) + +find_path(OPENCL_INCLUDE_DIRS + NAMES OpenCL/cl.h CL/cl.h + HINTS + ${OPENCL_ROOT}/include + $ENV{AMDAPPSDKROOT}/include + $ENV{CUDA_PATH}/include + PATHS + /usr/include + /usr/local/include + /usr/local/cuda/include + /opt/cuda/include + DOC "OpenCL header file path" +) +mark_as_advanced( OPENCL_INCLUDE_DIRS ) + +# Search for 64bit libs if FIND_LIBRARY_USE_LIB64_PATHS is set to true in the global environment, 32bit libs else +get_property( LIB64 GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS ) + +if( LIB64 ) + find_library( OPENCL_LIBRARIES + NAMES OpenCL + HINTS + ${OPENCL_ROOT}/lib + $ENV{AMDAPPSDKROOT}/lib + $ENV{CUDA_PATH}/lib + DOC "OpenCL dynamic library path" + PATH_SUFFIXES x86_64 x64 x86_64/sdk + PATHS + /usr/lib + /usr/local/cuda/lib + /opt/cuda/lib + ) +else( ) + find_library( OPENCL_LIBRARIES + NAMES OpenCL + HINTS + ${OPENCL_ROOT}/lib + $ENV{AMDAPPSDKROOT}/lib + $ENV{CUDA_PATH}/lib + DOC "OpenCL dynamic library path" + PATH_SUFFIXES x86 Win32 + PATHS + /usr/lib + /usr/local/cuda/lib + /opt/cuda/lib + ) +endif( ) +mark_as_advanced( OPENCL_LIBRARIES ) + +include( FindPackageHandleStandardArgs ) +FIND_PACKAGE_HANDLE_STANDARD_ARGS( OPENCL DEFAULT_MSG OPENCL_LIBRARIES OPENCL_INCLUDE_DIRS ) + +if( NOT OPENCL_FOUND ) + message( STATUS "FindOpenCL looked for libraries named: OpenCL" ) +else () + message( STATUS "Found OpenCL (include: ${OPENCL_INCLUDE_DIRS}, library: ${OPENCL_LIBRARIES})") +endif() diff --git a/cmake/Modules/FindclBLAS.cmake b/cmake/Modules/FindclBLAS.cmake new file mode 100644 index 00000000..1fa28762 --- /dev/null +++ b/cmake/Modules/FindclBLAS.cmake @@ -0,0 +1,98 @@ +# ######################################################################## +# Copyright 2013 Advanced Micro Devices, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ######################################################################## + + +# Locate an clBLAS library. +# +# Defines the following variables: +# +# CLBLAS_FOUND - Found the CLBLAS library +# CLBLAS_INCLUDE_DIRS - Include directories +# +# Also defines the library variables below as normal +# variables. These contain debug/optimized keywords when +# a debugging library is found. +# +# CLBLAS_LIBRARIES - libclBLAS +# +# Accepts the following variables as input: +# +# CLBLAS_ROOT - (as a CMake or environment variable) +# The root directory of the clBLAS library found +# +# FIND_LIBRARY_USE_LIB64_PATHS - Global property that controls whether findclBLAS should search for +# 64bit or 32bit libs +#----------------------- +# Example Usage: +# +# find_package(clBLAS REQUIRED) +# include_directories(${CLBLAS_INCLUDE_DIRS}) +# +# add_executable(foo foo.cc) +# target_link_libraries(foo ${CLBLAS_LIBRARIES}) +# +#----------------------- + +set_property(GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS ON) + +find_path(CLBLAS_INCLUDE_DIRS NAMES clBLAS.h + HINTS + $ENV{CLBLAS_ROOT}/include + PATHS + /usr/include + /usr/local/include + DOC "clBLAS header file path" +) +mark_as_advanced( CLBLAS_INCLUDE_DIRS ) + +# Search for 64bit libs if FIND_LIBRARY_USE_LIB64_PATHS is set to true in the global environment, 32bit libs else +get_property( LIB64 GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS ) + +if( LIB64 ) + find_library( CLBLAS_LIBRARIES + NAMES clBLAS + HINTS + $ENV{CLBLAS_ROOT}/lib64 + DOC "clBLAS dynamic library path" + PATHS + /usr/lib + /usr/local/lib + ) +else( ) + find_library( CLBLAS_LIBRARIES + NAMES clBLAS + HINTS + $ENV{CLBLAS_ROOT}/lib + DOC "clBLAS dynamic library path" + PATHS + /usr/lib + /usr/local/lib + ) +endif( ) +mark_as_advanced( CLBLAS_LIBRARIES ) + +if (NOT CLBLAS_INCLUDE_DIRS) + set(CLBLAS_FOUND ON) +endif() + +include( FindPackageHandleStandardArgs ) +FIND_PACKAGE_HANDLE_STANDARD_ARGS( CLBLAS DEFAULT_MSG CLBLAS_LIBRARIES CLBLAS_INCLUDE_DIRS ) + +if( NOT CLBLAS_FOUND ) + message( STATUS "FindclBLAS looked for libraries named: clBLAS" ) +else () + message( STATUS "Found clBLAS (include: ${CLBLAS_INCLUDE_DIRS}, library: ${CLBLAS_LIBRARIES})") +endif() diff --git a/cmake/OpenCL.cmake b/cmake/OpenCL.cmake new file mode 100644 index 00000000..c83ce7eb --- /dev/null +++ b/cmake/OpenCL.cmake @@ -0,0 +1,26 @@ +if(CPU_ONLY) + return() +endif() + +#find_path(OCL_INCLUDE_DIR NAMES CL/cl.h PATHS "$ENV{AMDAPPSDKROOT}/include") +#find_library(OCL_LIBRARIES NAMES libOpenCL.so PATHS "$ENV{AMDAPPSDKROOT}/lib/x86_64") + +#find_path(CLBLAS_INCLUDE_DIR NAMES clBLAS.h PATHS /opt/clBLAS-2.1/include $ENV{C_INCLUDE_PATH} $ENV{CPLUS_INCLUDE_PATH}) +#find_library(CLBLAS_LIBRARIES NAMES libclBLAS.so PATHS $ENV{LD_LIBRARY_PATH}) + +#if(OCL_INCLUDE_DIR AND OCL_LIBRARIES) +# set(OCL_FOUND TRUE PARENT_SCOPE) +# message(STATUS "Found OpenCL (include: ${OCL_INCLUDE_DIR}, library: ${OCL_LIBRARIES})") +#endif() + +#if(CLBLAS_INCLUDE_DIR AND CLBLAS_LIBRARIES) +# set(CLBLAS_FOUND TRUE PARENT_SCOPE) +#endif() + +#set(OCL_INCLUDE_DIR /opt/AMDAPPSDK-2.9-1/include) +#set(OCL_LIBRARIES 
/opt/AMDAPPSDK-2.9-1/lib/x86_64/libOpenCL.so) +set(CLBLAS_INCLUDE_DIR /opt/clBLAS-2.1/include) +set(CLBLAS_LIBRARIES /opt/clBLAS-2.1/lib64/libclBLAS.so) + + + diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index e094ac00..2d95b0a9 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -125,7 +125,9 @@ function(caffe_print_configuration_summary) caffe_status(" Snappy : " SNAPPY_FOUND THEN "Yes (ver. ${Snappy_VERSION})" ELSE "No" ) caffe_status(" LevelDB : " LEVELDB_FOUND THEN "Yes (ver. ${LEVELDB_VERSION})" ELSE "No") caffe_status(" OpenCV : Yes (ver. ${OpenCV_VERSION})") - caffe_status(" CUDA : " HAVE_CUDA THEN "Yes (ver. ${CUDA_VERSION})" ELSE "No" ) +# caffe_status(" CUDA : " HAVE_CUDA THEN "Yes (ver. ${CUDA_VERSION})" ELSE "No" ) + caffe_status(" OpenCL : " OPENCL_FOUND THEN "Yes" ELSE "No") + caffe_status(" clBLAS : " CLBLAS_FOUND THEN "Yes" ELSE "No") caffe_status("") if(HAVE_CUDA) caffe_status("NVIDIA CUDA:") diff --git a/cmake/Templates/caffe_config.h.in b/cmake/Templates/caffe_config.h.in index 6039e8f6..ca9a3a9a 100644 --- a/cmake/Templates/caffe_config.h.in +++ b/cmake/Templates/caffe_config.h.in @@ -14,6 +14,10 @@ /* NVIDA cuDNN */ #cmakedefine CPU_ONLY +/* OpenCL & clBLAS*/ +#cmakedefine OCL_FOUND +#cmakedefine CLBLAS_FOUND + /* Test device */ #define CUDA_TEST_DEVICE ${CUDA_TEST_DEVICE} diff --git a/examples/imagenet/train_alexnet.sh b/examples/imagenet/train_alexnet.sh new file mode 100755 index 00000000..58e5229f --- /dev/null +++ b/examples/imagenet/train_alexnet.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env sh + +GLOG_logtostderr=0 ./build/tools/caffe train \ + --solver=models/bvlc_alexnet/solver.prototxt diff --git a/examples/imagenet/train_alexnet_cpu.sh b/examples/imagenet/train_alexnet_cpu.sh new file mode 100755 index 00000000..a86f75fe --- /dev/null +++ b/examples/imagenet/train_alexnet_cpu.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env sh + +./build/tools/caffe train \ + --solver=models/bvlc_alexnet/solver_cpu.prototxt diff --git a/examples/imagenet/train_alexnet_without_dropout.sh b/examples/imagenet/train_alexnet_without_dropout.sh new file mode 100755 index 00000000..667543bf --- /dev/null +++ b/examples/imagenet/train_alexnet_without_dropout.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env sh + +GLOG_logtostderr=0 ./build/tools/caffe train \ + --solver=models/bvlc_alexnet/solver_without_dropout.prototxt diff --git a/examples/imagenet/train_alexnet_without_dropout_cpu.sh b/examples/imagenet/train_alexnet_without_dropout_cpu.sh new file mode 100755 index 00000000..12d43fc3 --- /dev/null +++ b/examples/imagenet/train_alexnet_without_dropout_cpu.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env sh + +GLOG_logtostderr=0 ./build/tools/caffe train \ + --solver=models/bvlc_alexnet/solver_without_dropout_cpu.prototxt diff --git a/examples/imagenet/train_caffenet_cpu.sh b/examples/imagenet/train_caffenet_cpu.sh new file mode 100755 index 00000000..4bcebf36 --- /dev/null +++ b/examples/imagenet/train_caffenet_cpu.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env sh + +./build/tools/caffe train \ + --solver=models/bvlc_reference_caffenet/solver_cpu.prototxt diff --git a/include/caffe/blob.hpp b/include/caffe/blob.hpp index 472cc184..9f22a082 100644 --- a/include/caffe/blob.hpp +++ b/include/caffe/blob.hpp @@ -23,258 +23,279 @@ namespace caffe { */ template class Blob { - public: - Blob() - : data_(), diff_(), count_(0), capacity_(0) {} + public: + Blob() + : data_(), diff_(), count_(0), capacity_(0) { + } - /// @brief Deprecated; use Blob(const vector& shape). 
- explicit Blob(const int num, const int channels, const int height, - const int width); - explicit Blob(const vector& shape); + /// @brief Deprecated; use Blob(const vector& shape). + explicit Blob(const int num, const int channels, const int height, + const int width); + explicit Blob(const vector& shape); - /// @brief Deprecated; use Reshape(const vector& shape). - void Reshape(const int num, const int channels, const int height, - const int width); - /** - * @brief Change the dimensions of the blob, allocating new memory if - * necessary. - * - * This function can be called both to create an initial allocation - * of memory, and to adjust the dimensions of a top blob during Layer::Reshape - * or Layer::Forward. When changing the size of blob, memory will only be - * reallocated if sufficient memory does not already exist, and excess memory - * will never be freed. - * - * Note that reshaping an input blob and immediately calling Net::Backward is - * an error; either Net::Forward or Net::Reshape need to be called to - * propagate the new input shape to higher layers. - */ - void Reshape(const vector& shape); - void Reshape(const BlobShape& shape); - void ReshapeLike(const Blob& other); - inline string shape_string() const { - ostringstream stream; - for (int i = 0; i < shape_.size(); ++i) { - stream << shape_[i] << " "; + /// @brief Deprecated; use Reshape(const vector& shape). + void Reshape(const int num, const int channels, const int height, + const int width); + /** + * @brief Change the dimensions of the blob, allocating new memory if + * necessary. + * + * This function can be called both to create an initial allocation + * of memory, and to adjust the dimensions of a top blob during Layer::Reshape + * or Layer::Forward. When changing the size of blob, memory will only be + * reallocated if sufficient memory does not already exist, and excess memory + * will never be freed. + * + * Note that reshaping an input blob and immediately calling Net::Backward is + * an error; either Net::Forward or Net::Reshape need to be called to + * propagate the new input shape to higher layers. + */ + void Reshape(const vector& shape); + void Reshape(const BlobShape& shape); + void ReshapeLike(const Blob& other); + inline string shape_string() const { + ostringstream stream; + for (int i = 0; i < shape_.size(); ++i) { + stream << shape_[i] << " "; + } + stream << "(" << count_ << ")"; + return stream.str(); + } + inline const vector& shape() const { + return shape_; + } + /** + * @brief Returns the dimension of the index-th axis (or the negative index-th + * axis from the end, if index is negative). + * + * @param index the axis index, which may be negative as it will be + * "canonicalized" using CanonicalAxisIndex. + * Dies on out of range index. + */ + inline int shape(int index) const { + return shape_[CanonicalAxisIndex(index)]; + } + inline int num_axes() const { + return shape_.size(); + } + inline int count() const { + return count_; } - stream << "(" << count_ << ")"; - return stream.str(); - } - inline const vector& shape() const { return shape_; } - /** - * @brief Returns the dimension of the index-th axis (or the negative index-th - * axis from the end, if index is negative). - * - * @param index the axis index, which may be negative as it will be - * "canonicalized" using CanonicalAxisIndex. - * Dies on out of range index. 
- */ - inline int shape(int index) const { - return shape_[CanonicalAxisIndex(index)]; - } - inline int num_axes() const { return shape_.size(); } - inline int count() const { return count_; } - /** - * @brief Compute the volume of a slice; i.e., the product of dimensions - * among a range of axes. - * - * @param start_axis The first axis to include in the slice. - * - * @param end_axis The first axis to exclude from the slice. - */ - inline int count(int start_axis, int end_axis) const { - CHECK_LE(start_axis, end_axis); - CHECK_GE(start_axis, 0); - CHECK_GE(end_axis, 0); - CHECK_LE(start_axis, num_axes()); - CHECK_LE(end_axis, num_axes()); - int count = 1; - for (int i = start_axis; i < end_axis; ++i) { - count *= shape(i); + /** + * @brief Compute the volume of a slice; i.e., the product of dimensions + * among a range of axes. + * + * @param start_axis The first axis to include in the slice. + * + * @param end_axis The first axis to exclude from the slice. + */ + inline int count(int start_axis, int end_axis) const { + CHECK_LE(start_axis, end_axis); + CHECK_GE(start_axis, 0); + CHECK_GE(end_axis, 0); + CHECK_LE(start_axis, num_axes()); + CHECK_LE(end_axis, num_axes()); + int count = 1; + for (int i = start_axis; i < end_axis; ++i) { + count *= shape(i); + } + return count; + } + /** + * @brief Compute the volume of a slice spanning from a particular first + * axis to the final axis. + * + * @param start_axis The first axis to include in the slice. + */ + inline int count(int start_axis) const { + return count(start_axis, num_axes()); } - return count; - } - /** - * @brief Compute the volume of a slice spanning from a particular first - * axis to the final axis. - * - * @param start_axis The first axis to include in the slice. - */ - inline int count(int start_axis) const { - return count(start_axis, num_axes()); - } - /** - * @brief Returns the 'canonical' version of a (usually) user-specified axis, - * allowing for negative indexing (e.g., -1 for the last axis). - * - * @param index the axis index. - * If 0 <= index < num_axes(), return index. - * If -num_axes <= index <= -1, return (num_axes() - (-index)), - * e.g., the last axis index (num_axes() - 1) if index == -1, - * the second to last if index == -2, etc. - * Dies on out of range index. - */ - inline int CanonicalAxisIndex(int axis_index) const { - CHECK_GE(axis_index, -num_axes()) - << "axis " << axis_index << " out of range for " << num_axes() - << "-D Blob with shape " << shape_string(); - CHECK_LT(axis_index, num_axes()) - << "axis " << axis_index << " out of range for " << num_axes() - << "-D Blob with shape " << shape_string(); - if (axis_index < 0) { - return axis_index + num_axes(); + /** + * @brief Returns the 'canonical' version of a (usually) user-specified axis, + * allowing for negative indexing (e.g., -1 for the last axis). + * + * @param index the axis index. + * If 0 <= index < num_axes(), return index. + * If -num_axes <= index <= -1, return (num_axes() - (-index)), + * e.g., the last axis index (num_axes() - 1) if index == -1, + * the second to last if index == -2, etc. + * Dies on out of range index. 
+ */ + inline int CanonicalAxisIndex(int axis_index) const { + CHECK_GE(axis_index, -num_axes()) << "axis " << axis_index + << " out of range for " << num_axes() << "-D Blob with shape " + << shape_string(); + CHECK_LT(axis_index, num_axes()) << "axis " << axis_index + << " out of range for " << num_axes() << "-D Blob with shape " + << shape_string(); + if (axis_index < 0) { + return axis_index + num_axes(); + } + return axis_index; } - return axis_index; - } - /// @brief Deprecated legacy shape accessor num: use shape(0) instead. - inline int num() const { return LegacyShape(0); } - /// @brief Deprecated legacy shape accessor channels: use shape(1) instead. - inline int channels() const { return LegacyShape(1); } - /// @brief Deprecated legacy shape accessor height: use shape(2) instead. - inline int height() const { return LegacyShape(2); } - /// @brief Deprecated legacy shape accessor width: use shape(3) instead. - inline int width() const { return LegacyShape(3); } - inline int LegacyShape(int index) const { - CHECK_LE(num_axes(), 4) - << "Cannot use legacy accessors on Blobs with > 4 axes."; - CHECK_LT(index, 4); - CHECK_GE(index, -4); - if (index >= num_axes() || index < -num_axes()) { - // Axis is out of range, but still in [0, 3] (or [-4, -1] for reverse - // indexing) -- this special case simulates the one-padding used to fill - // extraneous axes of legacy blobs. - return 1; + /// @brief Deprecated legacy shape accessor num: use shape(0) instead. + inline int num() const { + return LegacyShape(0); + } + /// @brief Deprecated legacy shape accessor channels: use shape(1) instead. + inline int channels() const { + return LegacyShape(1); + } + /// @brief Deprecated legacy shape accessor height: use shape(2) instead. + inline int height() const { + return LegacyShape(2); + } + /// @brief Deprecated legacy shape accessor width: use shape(3) instead. + inline int width() const { + return LegacyShape(3); + } + inline int LegacyShape(int index) const { + CHECK_LE(num_axes(), 4) + << "Cannot use legacy accessors on Blobs with > 4 axes."; + CHECK_LT(index, 4); + CHECK_GE(index, -4); + if (index >= num_axes() || index < -num_axes()) { + // Axis is out of range, but still in [0, 3] (or [-4, -1] for reverse + // indexing) -- this special case simulates the one-padding used to fill + // extraneous axes of legacy blobs. 
+ return 1; + } + return shape(index); } - return shape(index); - } - inline int offset(const int n, const int c = 0, const int h = 0, - const int w = 0) const { - CHECK_GE(n, 0); - CHECK_LE(n, num()); - CHECK_GE(channels(), 0); - CHECK_LE(c, channels()); - CHECK_GE(height(), 0); - CHECK_LE(h, height()); - CHECK_GE(width(), 0); - CHECK_LE(w, width()); - return ((n * channels() + c) * height() + h) * width() + w; - } + inline int offset(const int n, const int c = 0, const int h = 0, + const int w = 0) const { + CHECK_GE(n, 0); + CHECK_LE(n, num()); + CHECK_GE(channels(), 0); + CHECK_LE(c, channels()); + CHECK_GE(height(), 0); + CHECK_LE(h, height()); + CHECK_GE(width(), 0); + CHECK_LE(w, width()); + return ((n * channels() + c) * height() + h) * width() + w; + } - inline int offset(const vector& indices) const { - CHECK_LE(indices.size(), num_axes()); - int offset = 0; - for (int i = 0; i < num_axes(); ++i) { - offset *= shape(i); - if (indices.size() > i) { - CHECK_GE(indices[i], 0); - CHECK_LT(indices[i], shape(i)); - offset += indices[i]; + inline int offset(const vector& indices) const { + CHECK_LE(indices.size(), num_axes()); + int offset = 0; + for (int i = 0; i < num_axes(); ++i) { + offset *= shape(i); + if (indices.size() > i) { + CHECK_GE(indices[i], 0); + CHECK_LT(indices[i], shape(i)); + offset += indices[i]; + } } + return offset; } - return offset; - } - /** - * @brief Copy from a source Blob. - * - * @param source the Blob to copy from - * @param copy_diff if false, copy the data; if true, copy the diff - * @param reshape if false, require this Blob to be pre-shaped to the shape - * of other (and die otherwise); if true, Reshape this Blob to other's - * shape if necessary - */ - void CopyFrom(const Blob& source, bool copy_diff = false, - bool reshape = false); + /** + * @brief Copy from a source Blob. 
+ * + * @param source the Blob to copy from + * @param copy_diff if false, copy the data; if true, copy the diff + * @param reshape if false, require this Blob to be pre-shaped to the shape + * of other (and die otherwise); if true, Reshape this Blob to other's + * shape if necessary + */ + void CopyFrom(const Blob& source, bool copy_diff = false, + bool reshape = false); - inline Dtype data_at(const int n, const int c, const int h, - const int w) const { - return cpu_data()[offset(n, c, h, w)]; - } + inline Dtype data_at(const int n, const int c, const int h, + const int w) const { + return cpu_data()[offset(n, c, h, w)]; + } - inline Dtype diff_at(const int n, const int c, const int h, - const int w) const { - return cpu_diff()[offset(n, c, h, w)]; - } + inline Dtype diff_at(const int n, const int c, const int h, + const int w) const { + return cpu_diff()[offset(n, c, h, w)]; + } - inline Dtype data_at(const vector& index) const { - return cpu_data()[offset(index)]; - } + inline Dtype data_at(const vector& index) const { + return cpu_data()[offset(index)]; + } - inline Dtype diff_at(const vector& index) const { - return cpu_diff()[offset(index)]; - } + inline Dtype diff_at(const vector& index) const { + return cpu_diff()[offset(index)]; + } - inline const shared_ptr& data() const { - CHECK(data_); - return data_; - } + inline const shared_ptr& data() const { + CHECK(data_); + return data_; + } - inline const shared_ptr& diff() const { - CHECK(diff_); - return diff_; - } + inline const shared_ptr& diff() const { + CHECK(diff_); + return diff_; + } - const Dtype* cpu_data() const; - void set_cpu_data(Dtype* data); - const Dtype* gpu_data() const; - const Dtype* cpu_diff() const; - const Dtype* gpu_diff() const; - Dtype* mutable_cpu_data(); - Dtype* mutable_gpu_data(); - Dtype* mutable_cpu_diff(); - Dtype* mutable_gpu_diff(); - void Update(); - void FromProto(const BlobProto& proto, bool reshape = true); - void ToProto(BlobProto* proto, bool write_diff = false) const; + const Dtype* cpu_data() const; + void set_cpu_data(Dtype* data); + const Dtype* gpu_data() const; + const Dtype* gpu_cache_data() const; + const Dtype* cpu_diff() const; + const Dtype* gpu_diff() const; + Dtype* mutable_cpu_data(); + Dtype* mutable_gpu_data(); + Dtype* mutable_cpu_diff(); + Dtype* mutable_gpu_diff(); + void Update(); + void FromProto(const BlobProto& proto, bool reshape = true); + void ToProto(BlobProto* proto, bool write_diff = false) const; - /// @brief Compute the sum of absolute values (L1 norm) of the data. - Dtype asum_data() const; - /// @brief Compute the sum of absolute values (L1 norm) of the diff. - Dtype asum_diff() const; - /// @brief Compute the sum of squares (L2 norm squared) of the data. - Dtype sumsq_data() const; - /// @brief Compute the sum of squares (L2 norm squared) of the diff. - Dtype sumsq_diff() const; + /// @brief Compute the sum of absolute values (L1 norm) of the data. + Dtype asum_data() const; + /// @brief Compute the sum of absolute values (L1 norm) of the diff. + Dtype asum_diff() const; + /// @brief Compute the sum of squares (L2 norm squared) of the data. + Dtype sumsq_data() const; + /// @brief Compute the sum of squares (L2 norm squared) of the diff. + Dtype sumsq_diff() const; - /// @brief Scale the blob data by a constant factor. - void scale_data(Dtype scale_factor); - /// @brief Scale the blob diff by a constant factor. - void scale_diff(Dtype scale_factor); + /// @brief Scale the blob data by a constant factor. 
+ void scale_data(Dtype scale_factor); + /// @brief Scale the blob diff by a constant factor. + void scale_diff(Dtype scale_factor); - /** - * @brief Set the data_ shared_ptr to point to the SyncedMemory holding the - * data_ of Blob other -- useful in Layer%s which simply perform a copy - * in their Forward pass. - * - * This deallocates the SyncedMemory holding this Blob's data_, as - * shared_ptr calls its destructor when reset with the "=" operator. - */ - void ShareData(const Blob& other); - /** - * @brief Set the diff_ shared_ptr to point to the SyncedMemory holding the - * diff_ of Blob other -- useful in Layer%s which simply perform a copy - * in their Forward pass. - * - * This deallocates the SyncedMemory holding this Blob's diff_, as - * shared_ptr calls its destructor when reset with the "=" operator. - */ - void ShareDiff(const Blob& other); + /** + * @brief Set the data_ shared_ptr to point to the SyncedMemory holding the + * data_ of Blob other -- useful in Layer%s which simply perform a copy + * in their Forward pass. + * + * This deallocates the SyncedMemory holding this Blob's data_, as + * shared_ptr calls its destructor when reset with the "=" operator. + */ + void ShareData(const Blob& other); + /** + * @brief Set the diff_ shared_ptr to point to the SyncedMemory holding the + * diff_ of Blob other -- useful in Layer%s which simply perform a copy + * in their Forward pass. + * + * This deallocates the SyncedMemory holding this Blob's diff_, as + * shared_ptr calls its destructor when reset with the "=" operator. + */ + void ShareDiff(const Blob& other); + void set_data_layer() { + data_->set_data_layer(); + diff_->set_data_layer(); + } - bool ShapeEquals(const BlobProto& other); + bool ShapeEquals(const BlobProto& other); - protected: - shared_ptr data_; - shared_ptr diff_; - vector shape_; - int count_; - int capacity_; + protected: + shared_ptr data_; + shared_ptr diff_; + vector shape_; + int count_; + int capacity_; - DISABLE_COPY_AND_ASSIGN(Blob); -}; // class Blob + DISABLE_COPY_AND_ASSIGN (Blob); +}; +// class Blob -} // namespace caffe +}// namespace caffe #endif // CAFFE_BLOB_HPP_ diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp index 5f86bc26..0b455c59 100644 --- a/include/caffe/common.hpp +++ b/include/caffe/common.hpp @@ -16,7 +16,17 @@ #include // pair #include +#ifndef CPU_ONLY +#include +#include +#include +#endif + +#include "caffe/device.hpp" #include "caffe/util/device_alternate.hpp" +#include "caffe/util/ocl_wrapper.hpp" +#include "caffe/util/ocl_util.hpp" +#include "caffe/util/im2col.hpp" // gflags 2.1 issue: namespace google was changed to gflags without warning. // Luckily we will be able to use GFLAGS_GFLAGS_H_ to detect if it is version @@ -64,9 +74,91 @@ private:\ // A simple macro to mark codes that are not implemented, so that when the code // is executed we will see a fatal log. 
#define NOT_IMPLEMENTED LOG(FATAL) << "Not Implemented Yet" +//OpenCL: various of defines to choose the design schemes +/* ifdef: use CPU random generator in dropout layer + ifndef: use GPU random generator*/ +//#define use_cpu_generator_dropout +//#define print_memory_trace +//the following are macro defines for optimization schmes in conv layer +/*ifdef: use proposed img_packing scheme; + ifndef: use proposed packing im2col + sgemm scheme*/ +#define use_packing_scheme 0 +/* global_packing_N defines packing number of the use_packing scheme + for intial design, we use the same packing number for all conv layers*/ +#define global_packing_N 16 +/*ifdef: use multi-command queues for groups in conv layer; + ifndef: use single commane queue for groups*/ +//#define multiQ +//#define check_gradient +// OpenCL: various checks for different function calls. +#define OCL_CHECK(condition) \ + do { \ + cl_int error = condition; \ + CHECK_EQ(error, CL_SUCCESS) << " " << error; \ + if(CL_SUCCESS != error){ \ + LOG(INFO) << "failed";\ + } \ + } while (0) + +#define CLBLAS_CHECK(flag) \ + do { \ + cl_int error = flag; \ + CHECK_EQ(error, clblasSuccess) << " " << error; \ + if (error != clblasSuccess){ \ + LOG(INFO) << "clBlas Function Failed! Error Code:" << error; \ + } \ + } while(0) + +//sample #num data from Blob_ +#define CHECK_BLOB_DATA(Blob_, num, marker) \ +do{ \ + const Dtype *top_cpu_data = Blob_->cpu_data(); \ + size_t top_cpu_data_count = Blob_->count(); \ + size_t sample_interval = top_cpu_data_count/num; \ + if(sample_interval == 0){ \ + sample_interval=1; \ + } \ + printf("%s: ", marker); \ + for(int i=0; i generator_; - }; - - // Getters for boost rng, curand, and cublas handles - inline static RNG& rng_stream() { - if (!Get().random_generator_) { - Get().random_generator_.reset(new RNG()); + enum Brew { + CPU, GPU, APU + }; + + // This random number generator facade hides boost and CUDA rng + // implementation from one another (for cross-platform compatibility). + class RNG { + public: + RNG(); + explicit RNG(unsigned int seed); + explicit RNG(const RNG&); + RNG& operator=(const RNG&); + void* generator(); + private: + class Generator; + shared_ptr generator_; + }; + + // Getters for boost rng, curand, and cublas handles + inline static RNG& rng_stream() { + if (!Get().random_generator_) { + Get().random_generator_.reset(new RNG()); + } + return *(Get().random_generator_); } - return *(Get().random_generator_); - } #ifndef CPU_ONLY - inline static cublasHandle_t cublas_handle() { return Get().cublas_handle_; } - inline static curandGenerator_t curand_generator() { - return Get().curand_generator_; - } + //inline static cublasHandle_t cublas_handle() { return Get().cublas_handle_; } + //inline static curandGenerator_t curand_generator() { + // return Get().curand_generator_; + //} #endif - // Returns the mode: running on CPU or GPU. - inline static Brew mode() { return Get().mode_; } - // The setters for the variables - // Sets the mode. It is recommended that you don't change the mode halfway - // into the program since that may cause allocation of pinned memory being - // freed in a non-pinned way, which may cause problems - I haven't verified - // it personally but better to note it here in the header file. - inline static void set_mode(Brew mode) { Get().mode_ = mode; } - // Sets the random seed of both boost and curand - static void set_random_seed(const unsigned int seed); - // Sets the device. 
Since we have cublas and curand stuff, set device also - // requires us to reset those values. - static void SetDevice(const int device_id); - // Prints the current GPU status. - static void DeviceQuery(); - - protected: + // Returns the mode: running on CPU or GPU. + inline static Brew mode() { + return Get().mode_; + } + // The setters for the variables + // Sets the mode. It is recommended that you don't change the mode halfway + // into the program since that may cause allocation of pinned memory being + // freed in a non-pinned way, which may cause problems - I haven't verified + // it personally but better to note it here in the header file. + inline static void set_mode(Brew mode) { + Get().mode_ = mode; + } + // Sets the random seed of both boost and curand + static void set_random_seed(const unsigned int seed); + // Sets the device. Since we have cublas and curand stuff, set device also + // requires us to reset those values. + static void SetDevice(const int device_id); + // Prints the current GPU status. + static void DeviceQuery(); + + protected: #ifndef CPU_ONLY - cublasHandle_t cublas_handle_; - curandGenerator_t curand_generator_; + //cublasHandle_t cublas_handle_; + //curandGenerator_t curand_generator_; #endif - shared_ptr random_generator_; + shared_ptr random_generator_; - Brew mode_; - static shared_ptr singleton_; + Brew mode_; + static shared_ptr singleton_; - private: - // The private constructor to avoid duplicate instantiation. - Caffe(); + private: + // The private constructor to avoid duplicate instantiation. + Caffe(); DISABLE_COPY_AND_ASSIGN(Caffe); }; diff --git a/include/caffe/common_layers.hpp b/include/caffe/common_layers.hpp index d2c0ce6d..ab796286 100644 --- a/include/caffe/common_layers.hpp +++ b/include/caffe/common_layers.hpp @@ -26,48 +26,56 @@ namespace caffe { * NOTE: does not implement Backwards operation. */ template -class ArgMaxLayer : public Layer { - public: - /** - * @param param provides ArgMaxParameter argmax_param, - * with ArgMaxLayer options: - * - top_k (\b optional uint, default 1). - * the number @f$ K @f$ of maximal items to output. - * - out_max_val (\b optional bool, default false). - * if set, output a vector of pairs (max_ind, max_val) for each image. - */ - explicit ArgMaxLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "ArgMax"; } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int ExactNumTopBlobs() const { return 1; } - - protected: - /** - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$ - * @param top output Blob vector (length 1) - * -# @f$ (N \times 1 \times K \times 1) @f$ or, if out_max_val - * @f$ (N \times 2 \times K \times 1) @f$ - * the computed outputs @f$ - * y_n = \arg\max\limits_i x_{ni} - * @f$ (for @f$ K = 1 @f$). - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - /// @brief Not implemented (non-differentiable function) - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - NOT_IMPLEMENTED; - } - bool out_max_val_; - size_t top_k_; +class ArgMaxLayer: public Layer { + public: + /** + * @param param provides ArgMaxParameter argmax_param, + * with ArgMaxLayer options: + * - top_k (\b optional uint, default 1). 
+ * the number @f$ K @f$ of maximal items to output. + * - out_max_val (\b optional bool, default false). + * if set, output a vector of pairs (max_ind, max_val) for each image. + */ + explicit ArgMaxLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "ArgMax"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + /** + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$ + * @param top output Blob vector (length 1) + * -# @f$ (N \times 1 \times K \times 1) @f$ or, if out_max_val + * @f$ (N \times 2 \times K \times 1) @f$ + * the computed outputs @f$ + * y_n = \arg\max\limits_i x_{ni} + * @f$ (for @f$ K = 1 @f$). + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + /// @brief Not implemented (non-differentiable function) + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + NOT_IMPLEMENTED; + } + bool out_max_val_; + size_t top_k_; }; /** @@ -75,72 +83,79 @@ class ArgMaxLayer : public Layer { * or channel dimension, outputting the result. */ template -class ConcatLayer : public Layer { - public: - explicit ConcatLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Concat"; } - virtual inline int MinBottomBlobs() const { return 2; } - virtual inline int ExactNumTopBlobs() const { return 1; } - - protected: - /** - * @param bottom input Blob vector (length 2+) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x_1 @f$ - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x_2 @f$ - * -# ... - * - K @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x_K @f$ - * @param top output Blob vector (length 1) - * -# @f$ (KN \times C \times H \times W) @f$ if axis == 0, or - * @f$ (N \times KC \times H \times W) @f$ if axis == 1: - * the concatenated output @f$ - * y = [\begin{array}{cccc} x_1 & x_2 & ... & x_K \end{array}] - * @f$ - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the concatenate inputs. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (KN \times C \times H \times W) @f$ if axis == 0, or - * @f$ (N \times KC \times H \times W) @f$ if axis == 1: - * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ - * with respect to concatenated outputs @f$ y @f$ - * @param propagate_down see Layer::Backward. - * @param bottom input Blob vector (length K), into which the top gradient - * @f$ \frac{\partial E}{\partial y} @f$ is deconcatenated back to the - * inputs @f$ - * \left[ \begin{array}{cccc} - * \frac{\partial E}{\partial x_1} & - * \frac{\partial E}{\partial x_2} & - * ... 
& - * \frac{\partial E}{\partial x_K} - * \end{array} \right] = - * \frac{\partial E}{\partial y} - * @f$ - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - int count_; - int num_concats_; - int concat_input_size_; - int concat_axis_; +class ConcatLayer: public Layer { + public: + explicit ConcatLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Concat"; + } + virtual inline int MinBottomBlobs() const { + return 2; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + /** + * @param bottom input Blob vector (length 2+) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x_1 @f$ + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x_2 @f$ + * -# ... + * - K @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x_K @f$ + * @param top output Blob vector (length 1) + * -# @f$ (KN \times C \times H \times W) @f$ if axis == 0, or + * @f$ (N \times KC \times H \times W) @f$ if axis == 1: + * the concatenated output @f$ + * y = [\begin{array}{cccc} x_1 & x_2 & ... & x_K \end{array}] + * @f$ + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the concatenate inputs. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (KN \times C \times H \times W) @f$ if axis == 0, or + * @f$ (N \times KC \times H \times W) @f$ if axis == 1: + * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ + * with respect to concatenated outputs @f$ y @f$ + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length K), into which the top gradient + * @f$ \frac{\partial E}{\partial y} @f$ is deconcatenated back to the + * inputs @f$ + * \left[ \begin{array}{cccc} + * \frac{\partial E}{\partial x_1} & + * \frac{\partial E}{\partial x_2} & + * ... & + * \frac{\partial E}{\partial x_K} + * \end{array} \right] = + * \frac{\partial E}{\partial y} + * @f$ + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + int count_; + int num_concats_; + int concat_input_size_; + int concat_axis_; }; /** @@ -150,34 +165,41 @@ class ConcatLayer : public Layer { * TODO(dox): thorough documentation for Forward, Backward, and proto params. 
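For reference, the SUM mode of this layer scales each bottom blob by its coefficient (the coeffs_ member declared below) and accumulates the results element by element. A minimal CPU sketch of that accumulation for two inputs of equal count, illustrative only; the real layer goes through Caffe's BLAS wrappers such as caffe_axpy:

    // illustrative helper, not part of the patch
    void eltwise_sum(const float* x1, const float* x2,
                     float c1, float c2, int count, float* y) {
      for (int i = 0; i < count; ++i) {
        y[i] = c1 * x1[i] + c2 * x2[i];  // y = c1*x1 + c2*x2, elementwise
      }
    }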
*/ template -class EltwiseLayer : public Layer { - public: - explicit EltwiseLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Eltwise"; } - virtual inline int MinBottomBlobs() const { return 2; } - virtual inline int ExactNumTopBlobs() const { return 1; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - EltwiseParameter_EltwiseOp op_; - vector coeffs_; - Blob max_idx_; - - bool stable_prod_grad_; +class EltwiseLayer: public Layer { + public: + explicit EltwiseLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Eltwise"; + } + virtual inline int MinBottomBlobs() const { + return 2; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + EltwiseParameter_EltwiseOp op_; + vector coeffs_; + Blob max_idx_; + + bool stable_prod_grad_; }; /** @@ -187,60 +209,67 @@ class EltwiseLayer : public Layer { * item needs to stay). */ template -class FilterLayer : public Layer { - public: - explicit FilterLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Filter"; } - virtual inline int MinBottomBlobs() const { return 2; } - virtual inline int MinTopBlobs() const { return 1; } - - protected: - /** - * @param bottom input Blob vector (length 2+) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs to be filtered @f$ x_1 @f$ - * -# ... - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs to be filtered @f$ x_K @f$ - * -# @f$ (N \times 1 \times 1 \times 1) @f$ - * the selector blob - * @param top output Blob vector (length 1+) - * -# @f$ (S \times C \times H \times W) @f$ () - * the filtered output @f$ x_1 @f$ - * where S is the number of items - * that haven't been filtered - * @f$ (S \times C \times H \times W) @f$ - * the filtered output @f$ x_K @f$ - * where S is the number of items - * that haven't been filtered - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the forwarded inputs. - * - * @param top output Blob vector (length 1+), providing the error gradient with - * respect to the outputs - * @param propagate_down see Layer::Backward. 
- * @param bottom input Blob vector (length 2+), into which the top error - * gradient is copied - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - bool first_reshape_; - vector indices_to_forward_; +class FilterLayer: public Layer { + public: + explicit FilterLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Filter"; + } + virtual inline int MinBottomBlobs() const { + return 2; + } + virtual inline int MinTopBlobs() const { + return 1; + } + + protected: + /** + * @param bottom input Blob vector (length 2+) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs to be filtered @f$ x_1 @f$ + * -# ... + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs to be filtered @f$ x_K @f$ + * -# @f$ (N \times 1 \times 1 \times 1) @f$ + * the selector blob + * @param top output Blob vector (length 1+) + * -# @f$ (S \times C \times H \times W) @f$ () + * the filtered output @f$ x_1 @f$ + * where S is the number of items + * that haven't been filtered + * @f$ (S \times C \times H \times W) @f$ + * the filtered output @f$ x_K @f$ + * where S is the number of items + * that haven't been filtered + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the forwarded inputs. + * + * @param top output Blob vector (length 1+), providing the error gradient with + * respect to the outputs + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length 2+), into which the top error + * gradient is copied + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + bool first_reshape_; + vector indices_to_forward_; }; /** @@ -254,40 +283,47 @@ class FilterLayer : public Layer { * (see Blob::ShareDiff). */ template -class FlattenLayer : public Layer { - public: - explicit FlattenLayer(const LayerParameter& param) - : Layer(param) {} - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Flatten"; } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int ExactNumTopBlobs() const { return 1; } - - protected: - /** - * @param bottom input Blob vector (length 2+) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs - * @param top output Blob vector (length 1) - * -# @f$ (N \times CHW \times 1 \times 1) @f$ - * the outputs -- i.e., the (virtually) copied, flattened inputs - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the concatenate inputs. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * @param propagate_down see Layer::Backward. 
- * @param bottom input Blob vector (length K), into which the top error - * gradient is (virtually) copied - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); +class FlattenLayer: public Layer { + public: + explicit FlattenLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Flatten"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + /** + * @param bottom input Blob vector (length 2+) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs + * @param top output Blob vector (length 1) + * -# @f$ (N \times CHW \times 1 \times 1) @f$ + * the outputs -- i.e., the (virtually) copied, flattened inputs + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the concatenate inputs. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length K), into which the top error + * gradient is (virtually) copied + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); }; /** @@ -297,34 +333,41 @@ class FlattenLayer : public Layer { * TODO(dox): thorough documentation for Forward, Backward, and proto params. */ template -class InnerProductLayer : public Layer { - public: - explicit InnerProductLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "InnerProduct"; } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int ExactNumTopBlobs() const { return 1; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - int M_; - int K_; - int N_; - bool bias_term_; - Blob bias_multiplier_; +class InnerProductLayer: public Layer { + public: + explicit InnerProductLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "InnerProduct"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + int M_; + int K_; + int N_; + bool bias_term_; + Blob bias_multiplier_; }; /** @@ -333,32 +376,39 @@ class InnerProductLayer : public Layer { * TODO(dox): thorough documentation 
for Forward, Backward, and proto params. */ template -class MVNLayer : public Layer { - public: - explicit MVNLayer(const LayerParameter& param) - : Layer(param) {} - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "MVN"; } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int ExactNumTopBlobs() const { return 1; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - Blob mean_, variance_, temp_; - - /// sum_multiplier is used to carry out sum using BLAS - Blob sum_multiplier_; - Dtype eps_; +class MVNLayer: public Layer { + public: + explicit MVNLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "MVN"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + Blob mean_, variance_, temp_; + + /// sum_multiplier is used to carry out sum using BLAS + Blob sum_multiplier_; + Dtype eps_; }; /* @@ -368,35 +418,48 @@ class MVNLayer : public Layer { * (see FlattenLayer, Blob::ShareData and Blob::ShareDiff). 
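In the layer's reshape_param, a dimension of 0 copies the corresponding bottom dimension and -1 asks the layer to infer that dimension from the remaining element count, which is what the copy_axes_ and inferred_axis_ members below track. A hedged sketch of how those conventions resolve a requested shape (resolve_shape is an illustrative helper, not part of the patch):

    #include <vector>
    // spec: requested dims, where 0 = copy from bottom and -1 = infer;
    // count: total number of elements in the bottom blob
    std::vector<int> resolve_shape(const std::vector<int>& bottom_shape,
                                   std::vector<int> spec, int count) {
      int inferred = -1, known = 1;
      for (size_t i = 0; i < spec.size(); ++i) {
        if (spec[i] == 0) spec[i] = bottom_shape[i];      // copy this axis
        if (spec[i] == -1) inferred = static_cast<int>(i);
        else known *= spec[i];
      }
      if (inferred >= 0) spec[inferred] = count / known;  // infer the rest
      return spec;
    }

For example, a 2x3x4 bottom reshaped with spec {0, -1, 4} keeps the first axis and infers 3 for the second, while spec {0, -1} would give 2x12.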
*/ template -class ReshapeLayer : public Layer { - public: - explicit ReshapeLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Reshape"; } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int ExactNumTopBlobs() const { return 1; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top) {} - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) {} - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top) {} - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) {} - - /// @brief vector of axes indices whose dimensions we'll copy from the bottom - vector copy_axes_; - /// @brief the index of the axis whose dimension we infer, or -1 if none - int inferred_axis_; - /// @brief the product of the "constant" output dimensions - int constant_count_; +class ReshapeLayer: public Layer { + public: + explicit ReshapeLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Reshape"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top) { + } + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + } + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top) { + } + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + } + + /// @brief vector of axes indices whose dimensions we'll copy from the bottom + vector copy_axes_; + /// @brief the index of the axis whose dimension we infer, or -1 if none + int inferred_axis_; + /// @brief the product of the "constant" output dimensions + int constant_count_; }; /** @@ -407,41 +470,48 @@ class ReshapeLayer : public Layer { * TODO(dox): thorough documentation for Forward, Backward, and proto params. 
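This layer collapses everything from axis_ onward, so the num_ and dim_ members documented below end up as the number of independent reductions and the length of each one, with coeff_ applied to every output. A small illustrative sketch of the SUM case (not the layer's actual code, which uses the sum_multiplier_ blob and BLAS):

    // y receives `num` outputs; each reduces `dim` consecutive inputs
    void reduce_sum(const float* x, int num, int dim, float coeff, float* y) {
      for (int n = 0; n < num; ++n) {
        float acc = 0.0f;
        for (int d = 0; d < dim; ++d) {
          acc += x[n * dim + d];
        }
        y[n] = coeff * acc;  // one scaled sum per reduction
      }
    }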
*/ template -class ReductionLayer : public Layer { - public: - explicit ReductionLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Reduction"; } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int ExactNumTopBlobs() const { return 1; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - /// @brief the reduction operation performed by the layer - ReductionParameter_ReductionOp op_; - /// @brief a scalar coefficient applied to all outputs - Dtype coeff_; - /// @brief the index of the first input axis to reduce - int axis_; - /// @brief the number of reductions performed - int num_; - /// @brief the input size of each reduction - int dim_; - /// @brief a helper Blob used for summation (op_ == SUM) - Blob sum_multiplier_; +class ReductionLayer: public Layer { + public: + explicit ReductionLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Reduction"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + /// @brief the reduction operation performed by the layer + ReductionParameter_ReductionOp op_; + /// @brief a scalar coefficient applied to all outputs + Dtype coeff_; + /// @brief the index of the first input axis to reduce + int axis_; + /// @brief the number of reductions performed + int num_; + /// @brief the input size of each reduction + int dim_; + /// @brief a helper Blob used for summation (op_ == SUM) + Blob sum_multiplier_; }; /** @@ -449,28 +519,37 @@ class ReductionLayer : public Layer { * to suppress outputs during testing.) */ template -class SilenceLayer : public Layer { - public: - explicit SilenceLayer(const LayerParameter& param) - : Layer(param) {} - virtual void Reshape(const vector*>& bottom, - const vector*>& top) {} - - virtual inline const char* type() const { return "Silence"; } - virtual inline int MinBottomBlobs() const { return 1; } - virtual inline int ExactNumTopBlobs() const { return 0; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top) {} - // We can't define Forward_gpu here, since STUB_GPU will provide - // its own definition for CPU_ONLY mode. 
- virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); +class SilenceLayer: public Layer { + public: + explicit SilenceLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void Reshape(const vector*>& bottom, + const vector*>& top) { + } + + virtual inline const char* type() const { + return "Silence"; + } + virtual inline int MinBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 0; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top) { + } + // We can't define Forward_gpu here, since STUB_GPU will provide + // its own definition for CPU_ONLY mode. + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); }; /** @@ -479,34 +558,42 @@ class SilenceLayer : public Layer { * TODO(dox): thorough documentation for Forward, Backward, and proto params. */ template -class SoftmaxLayer : public Layer { - public: - explicit SoftmaxLayer(const LayerParameter& param) - : Layer(param) {} - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Softmax"; } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int ExactNumTopBlobs() const { return 1; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - int outer_num_; - int inner_num_; - int softmax_axis_; - /// sum_multiplier is used to carry out sum using BLAS - Blob sum_multiplier_; - /// scale is an intermediate Blob to hold temporary results. - Blob scale_; +class SoftmaxLayer: public Layer { + public: + explicit SoftmaxLayer(const LayerParameter& param) + : Layer(param) { + } + ~SoftmaxLayer(); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Softmax"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + int outer_num_; + int inner_num_; + int softmax_axis_; + /// sum_multiplier is used to carry out sum using BLAS + Blob sum_multiplier_; + /// scale is an intermediate Blob to hold temporary results. 
+ Blob scale_; }; #ifdef USE_CUDNN @@ -516,23 +603,23 @@ class SoftmaxLayer : public Layer { */ template class CuDNNSoftmaxLayer : public SoftmaxLayer { - public: + public: explicit CuDNNSoftmaxLayer(const LayerParameter& param) - : SoftmaxLayer(param), handles_setup_(false) {} + : SoftmaxLayer(param), handles_setup_(false) {} virtual void LayerSetUp(const vector*>& bottom, const vector*>& top); virtual void Reshape(const vector*>& bottom, const vector*>& top); virtual ~CuDNNSoftmaxLayer(); - protected: + protected: virtual void Forward_gpu(const vector*>& bottom, const vector*>& top); virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom); bool handles_setup_; - cudnnHandle_t handle_; + cudnnHandle_t handle_; cudnnTensorDescriptor_t bottom_desc_; cudnnTensorDescriptor_t top_desc_; }; @@ -545,28 +632,36 @@ class CuDNNSoftmaxLayer : public SoftmaxLayer { * TODO(dox): thorough documentation for Forward, Backward, and proto params. */ template -class SplitLayer : public Layer { - public: - explicit SplitLayer(const LayerParameter& param) - : Layer(param) {} - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Split"; } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int MinTopBlobs() const { return 1; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - int count_; +class SplitLayer: public Layer { + public: + explicit SplitLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Split"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int MinTopBlobs() const { + return 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + int count_; + cl_kernel gpu_add_kernel; }; /** @@ -576,34 +671,41 @@ class SplitLayer : public Layer { * TODO(dox): thorough documentation for Forward, Backward, and proto params. 
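The slice_point_ values declared below are the offsets at which the bottom blob is cut along slice_axis_; with no slice points the axis is split evenly across the top blobs. An illustrative helper (not part of the patch) that turns slice points into per-top extents:

    #include <vector>
    // e.g. an axis of length 10 with slice points {3, 7} yields extents 3, 4, 3
    std::vector<int> slice_extents(int axis_dim, const std::vector<int>& points) {
      std::vector<int> extents;
      int prev = 0;
      for (size_t i = 0; i < points.size(); ++i) {
        extents.push_back(points[i] - prev);
        prev = points[i];
      }
      extents.push_back(axis_dim - prev);  // the last top gets the remainder
      return extents;
    }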
*/ template -class SliceLayer : public Layer { - public: - explicit SliceLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Slice"; } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int MinTopBlobs() const { return 2; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - int count_; - int num_slices_; - int slice_size_; - int slice_axis_; - vector slice_point_; +class SliceLayer: public Layer { + public: + explicit SliceLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Slice"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int MinTopBlobs() const { + return 2; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + int count_; + int num_slices_; + int slice_size_; + int slice_axis_; + vector slice_point_; }; } // namespace caffe diff --git a/include/caffe/data_layers.hpp b/include/caffe/data_layers.hpp index 3958cb7e..d4f526b3 100644 --- a/include/caffe/data_layers.hpp +++ b/include/caffe/data_layers.hpp @@ -25,78 +25,95 @@ namespace caffe { * TODO(dox): thorough documentation for Forward and proto params. */ template -class BaseDataLayer : public Layer { - public: - explicit BaseDataLayer(const LayerParameter& param); - // LayerSetUp: implements common data layer setup functionality, and calls - // DataLayerSetUp to do special data layer setup for individual layer types. - // This method may not be overridden except by the BasePrefetchingDataLayer. - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void DataLayerSetUp(const vector*>& bottom, - const vector*>& top) {} - // Data layers have no bottoms, so reshaping is trivial. - virtual void Reshape(const vector*>& bottom, - const vector*>& top) {} - - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) {} - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) {} - - protected: - TransformationParameter transform_param_; - shared_ptr > data_transformer_; - bool output_labels_; +class BaseDataLayer: public Layer { + public: + explicit BaseDataLayer(const LayerParameter& param); + // LayerSetUp: implements common data layer setup functionality, and calls + // DataLayerSetUp to do special data layer setup for individual layer types. + // This method may not be overridden except by the BasePrefetchingDataLayer. 
+ virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void DataLayerSetUp(const vector*>& bottom, + const vector*>& top) { + } + // Data layers have no bottoms, so reshaping is trivial. + virtual void Reshape(const vector*>& bottom, + const vector*>& top) { + } + + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + } + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + } + + protected: + TransformationParameter transform_param_; + shared_ptr > data_transformer_; + bool output_labels_; }; template -class BasePrefetchingDataLayer : - public BaseDataLayer, public InternalThread { - public: - explicit BasePrefetchingDataLayer(const LayerParameter& param) - : BaseDataLayer(param) {} - // LayerSetUp: implements common data layer setup functionality, and calls - // DataLayerSetUp to do special data layer setup for individual layer types. - // This method may not be overridden. - void LayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - virtual void CreatePrefetchThread(); - virtual void JoinPrefetchThread(); - // The thread's function - virtual void InternalThreadEntry() {} - - protected: - Blob prefetch_data_; - Blob prefetch_label_; - Blob transformed_data_; +class BasePrefetchingDataLayer: public BaseDataLayer, + public InternalThread { + public: + explicit BasePrefetchingDataLayer(const LayerParameter& param) + : BaseDataLayer(param) { + } + // LayerSetUp: implements common data layer setup functionality, and calls + // DataLayerSetUp to do special data layer setup for individual layer types. + // This method may not be overridden. 
+ void LayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + virtual void CreatePrefetchThread(); + virtual void JoinPrefetchThread(); + // The thread's function + virtual void InternalThreadEntry() { + } + + protected: + Blob prefetch_data_; + Blob prefetch_label_; + Blob transformed_data_; }; template -class DataLayer : public BasePrefetchingDataLayer { - public: - explicit DataLayer(const LayerParameter& param) - : BasePrefetchingDataLayer(param) {} - virtual ~DataLayer(); - virtual void DataLayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Data"; } - virtual inline int ExactNumBottomBlobs() const { return 0; } - virtual inline int MinTopBlobs() const { return 1; } - virtual inline int MaxTopBlobs() const { return 2; } - - protected: - virtual void InternalThreadEntry(); - - shared_ptr db_; - shared_ptr cursor_; +class DataLayer: public BasePrefetchingDataLayer { + public: + explicit DataLayer(const LayerParameter& param) + : BasePrefetchingDataLayer(param) { + } + virtual ~DataLayer(); + virtual void DataLayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Data"; + } + virtual inline int ExactNumBottomBlobs() const { + return 0; + } + virtual inline int MinTopBlobs() const { + return 1; + } + virtual inline int MaxTopBlobs() const { + return 2; + } + + protected: + virtual void InternalThreadEntry(); + + shared_ptr db_; + shared_ptr cursor_; }; /** @@ -105,30 +122,42 @@ class DataLayer : public BasePrefetchingDataLayer { * TODO(dox): thorough documentation for Forward and proto params. */ template -class DummyDataLayer : public Layer { - public: - explicit DummyDataLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - // Data layers have no bottoms, so reshaping is trivial. - virtual void Reshape(const vector*>& bottom, - const vector*>& top) {} - - virtual inline const char* type() const { return "DummyData"; } - virtual inline int ExactNumBottomBlobs() const { return 0; } - virtual inline int MinTopBlobs() const { return 1; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) {} - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) {} - - vector > > fillers_; - vector refill_; +class DummyDataLayer: public Layer { + public: + explicit DummyDataLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + // Data layers have no bottoms, so reshaping is trivial. 
+ virtual void Reshape(const vector*>& bottom, + const vector*>& top) { + } + + virtual inline const char* type() const { + return "DummyData"; + } + virtual inline int ExactNumBottomBlobs() const { + return 0; + } + virtual inline int MinTopBlobs() const { + return 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + } + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + } + + vector > > fillers_; + vector refill_; }; /** @@ -137,39 +166,51 @@ class DummyDataLayer : public Layer { * TODO(dox): thorough documentation for Forward and proto params. */ template -class HDF5DataLayer : public Layer { - public: - explicit HDF5DataLayer(const LayerParameter& param) - : Layer(param) {} - virtual ~HDF5DataLayer(); - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - // Data layers have no bottoms, so reshaping is trivial. - virtual void Reshape(const vector*>& bottom, - const vector*>& top) {} - - virtual inline const char* type() const { return "HDF5Data"; } - virtual inline int ExactNumBottomBlobs() const { return 0; } - virtual inline int MinTopBlobs() const { return 1; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) {} - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) {} - virtual void LoadHDF5FileData(const char* filename); - - std::vector hdf_filenames_; - unsigned int num_files_; - unsigned int current_file_; - hsize_t current_row_; - std::vector > > hdf_blobs_; - std::vector data_permutation_; - std::vector file_permutation_; +class HDF5DataLayer: public Layer { + public: + explicit HDF5DataLayer(const LayerParameter& param) + : Layer(param) { + } + virtual ~HDF5DataLayer(); + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + // Data layers have no bottoms, so reshaping is trivial. + virtual void Reshape(const vector*>& bottom, + const vector*>& top) { + } + + virtual inline const char* type() const { + return "HDF5Data"; + } + virtual inline int ExactNumBottomBlobs() const { + return 0; + } + virtual inline int MinTopBlobs() const { + return 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + } + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + } + virtual void LoadHDF5FileData(const char* filename); + + std::vector hdf_filenames_; + unsigned int num_files_; + unsigned int current_file_; + hsize_t current_row_; + std::vector > > hdf_blobs_; + std::vector data_permutation_; + std::vector file_permutation_; }; /** @@ -178,40 +219,50 @@ class HDF5DataLayer : public Layer { * TODO(dox): thorough documentation for Forward and proto params. 
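This layer writes its two bottom blobs (data and label) to an HDF5 file; Caffe conventionally stores them under the dataset names "data" and "label". A hedged readback sketch using the HDF5 lite API (filename and count are caller-supplied; this is not part of the patch):

    #include <vector>
    #include "hdf5.h"
    #include "hdf5_hl.h"
    // count must equal the total number of floats written to the "data" set
    void read_back(const char* filename, int count) {
      hid_t file = H5Fopen(filename, H5F_ACC_RDONLY, H5P_DEFAULT);
      std::vector<float> data(count);
      H5LTread_dataset_float(file, "data", &data[0]);  // flattened blob contents
      H5Fclose(file);
    }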
*/ template -class HDF5OutputLayer : public Layer { - public: - explicit HDF5OutputLayer(const LayerParameter& param) - : Layer(param), file_opened_(false) {} - virtual ~HDF5OutputLayer(); - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - // Data layers have no bottoms, so reshaping is trivial. - virtual void Reshape(const vector*>& bottom, - const vector*>& top) {} - - virtual inline const char* type() const { return "HDF5Output"; } - // TODO: no limit on the number of blobs - virtual inline int ExactNumBottomBlobs() const { return 2; } - virtual inline int ExactNumTopBlobs() const { return 0; } - - inline std::string file_name() const { return file_name_; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void SaveBlobs(); - - bool file_opened_; - std::string file_name_; - hid_t file_id_; - Blob data_blob_; - Blob label_blob_; +class HDF5OutputLayer: public Layer { + public: + explicit HDF5OutputLayer(const LayerParameter& param) + : Layer(param), file_opened_(false) { + } + virtual ~HDF5OutputLayer(); + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + // Data layers have no bottoms, so reshaping is trivial. + virtual void Reshape(const vector*>& bottom, + const vector*>& top) { + } + + virtual inline const char* type() const { + return "HDF5Output"; + } + // TODO: no limit on the number of blobs + virtual inline int ExactNumBottomBlobs() const { + return 2; + } + virtual inline int ExactNumTopBlobs() const { + return 0; + } + + inline std::string file_name() const { + return file_name_; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void SaveBlobs(); + + bool file_opened_; + std::string file_name_; + hid_t file_id_; + Blob data_blob_; + Blob label_blob_; }; /** @@ -220,25 +271,32 @@ class HDF5OutputLayer : public Layer { * TODO(dox): thorough documentation for Forward and proto params. 
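This layer is driven by a plain-text source file listing one image path and integer label per line; those pairs end up in the lines_ member declared below. A sketch of that parsing, similar in spirit to what DataLayerSetUp does (read_list is an illustrative helper, not part of the patch):

    #include <fstream>
    #include <string>
    #include <utility>
    #include <vector>
    std::vector<std::pair<std::string, int> > read_list(const char* source) {
      std::vector<std::pair<std::string, int> > lines;
      std::ifstream infile(source);
      std::string filename;
      int label;
      while (infile >> filename >> label) {           // e.g. "cat.jpg 7"
        lines.push_back(std::make_pair(filename, label));
      }
      return lines;
    }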
*/ template -class ImageDataLayer : public BasePrefetchingDataLayer { - public: - explicit ImageDataLayer(const LayerParameter& param) - : BasePrefetchingDataLayer(param) {} - virtual ~ImageDataLayer(); - virtual void DataLayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "ImageData"; } - virtual inline int ExactNumBottomBlobs() const { return 0; } - virtual inline int ExactNumTopBlobs() const { return 2; } - - protected: - shared_ptr prefetch_rng_; - virtual void ShuffleImages(); - virtual void InternalThreadEntry(); - - vector > lines_; - int lines_id_; +class ImageDataLayer: public BasePrefetchingDataLayer { + public: + explicit ImageDataLayer(const LayerParameter& param) + : BasePrefetchingDataLayer(param) { + } + virtual ~ImageDataLayer(); + virtual void DataLayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "ImageData"; + } + virtual inline int ExactNumBottomBlobs() const { + return 0; + } + virtual inline int ExactNumTopBlobs() const { + return 2; + } + + protected: + shared_ptr prefetch_rng_; + virtual void ShuffleImages(); + virtual void InternalThreadEntry(); + + vector > lines_; + int lines_id_; }; /** @@ -247,43 +305,58 @@ class ImageDataLayer : public BasePrefetchingDataLayer { * TODO(dox): thorough documentation for Forward and proto params. */ template -class MemoryDataLayer : public BaseDataLayer { - public: - explicit MemoryDataLayer(const LayerParameter& param) - : BaseDataLayer(param), has_new_data_(false) {} - virtual void DataLayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "MemoryData"; } - virtual inline int ExactNumBottomBlobs() const { return 0; } - virtual inline int ExactNumTopBlobs() const { return 2; } - - virtual void AddDatumVector(const vector& datum_vector); - virtual void AddMatVector(const vector& mat_vector, - const vector& labels); - - // Reset should accept const pointers, but can't, because the memory - // will be given to Blob, which is mutable - void Reset(Dtype* data, Dtype* label, int n); - void set_batch_size(int new_size); - - int batch_size() { return batch_size_; } - int channels() { return channels_; } - int height() { return height_; } - int width() { return width_; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - - int batch_size_, channels_, height_, width_, size_; - Dtype* data_; - Dtype* labels_; - int n_; - size_t pos_; - Blob added_data_; - Blob added_label_; - bool has_new_data_; +class MemoryDataLayer: public BaseDataLayer { + public: + explicit MemoryDataLayer(const LayerParameter& param) + : BaseDataLayer(param), has_new_data_(false) { + } + virtual void DataLayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "MemoryData"; + } + virtual inline int ExactNumBottomBlobs() const { + return 0; + } + virtual inline int ExactNumTopBlobs() const { + return 2; + } + + virtual void AddDatumVector(const vector& datum_vector); + virtual void AddMatVector(const vector& mat_vector, + const vector& labels); + + // Reset should accept const pointers, but can't, because the memory + // will be given to Blob, which is mutable + void Reset(Dtype* data, Dtype* label, int n); + void set_batch_size(int new_size); + + int batch_size() { + return batch_size_; + } + int channels() { + return channels_; + } + int height() { + return height_; + } + int width() { + 
return width_; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + + int batch_size_, channels_, height_, width_, size_; + Dtype* data_; + Dtype* labels_; + int n_; + size_t pos_; + Blob added_data_; + Blob added_label_; + bool has_new_data_; }; /** @@ -293,33 +366,42 @@ class MemoryDataLayer : public BaseDataLayer { * TODO(dox): thorough documentation for Forward and proto params. */ template -class WindowDataLayer : public BasePrefetchingDataLayer { - public: - explicit WindowDataLayer(const LayerParameter& param) - : BasePrefetchingDataLayer(param) {} - virtual ~WindowDataLayer(); - virtual void DataLayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "WindowData"; } - virtual inline int ExactNumBottomBlobs() const { return 0; } - virtual inline int ExactNumTopBlobs() const { return 2; } - - protected: - virtual unsigned int PrefetchRand(); - virtual void InternalThreadEntry(); - - shared_ptr prefetch_rng_; - vector > > image_database_; - enum WindowField { IMAGE_INDEX, LABEL, OVERLAP, X1, Y1, X2, Y2, NUM }; - vector > fg_windows_; - vector > bg_windows_; - Blob data_mean_; - vector mean_values_; - bool has_mean_file_; - bool has_mean_values_; - bool cache_images_; - vector > image_database_cache_; +class WindowDataLayer: public BasePrefetchingDataLayer { + public: + explicit WindowDataLayer(const LayerParameter& param) + : BasePrefetchingDataLayer(param) { + } + virtual ~WindowDataLayer(); + virtual void DataLayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "WindowData"; + } + virtual inline int ExactNumBottomBlobs() const { + return 0; + } + virtual inline int ExactNumTopBlobs() const { + return 2; + } + + protected: + virtual unsigned int PrefetchRand(); + virtual void InternalThreadEntry(); + + shared_ptr prefetch_rng_; + vector > > image_database_; + enum WindowField { + IMAGE_INDEX, LABEL, OVERLAP, X1, Y1, X2, Y2, NUM + }; + vector > fg_windows_; + vector > bg_windows_; + Blob data_mean_; + vector mean_values_; + bool has_mean_file_; + bool has_mean_values_; + bool cache_images_; + vector > image_database_cache_; }; } // namespace caffe diff --git a/include/caffe/data_transformer.hpp b/include/caffe/data_transformer.hpp index 0ad68c80..daa4eee0 100644 --- a/include/caffe/data_transformer.hpp +++ b/include/caffe/data_transformer.hpp @@ -15,134 +15,134 @@ namespace caffe { */ template class DataTransformer { - public: - explicit DataTransformer(const TransformationParameter& param, Phase phase); - virtual ~DataTransformer() {} - - /** - * @brief Initialize the Random number generations if needed by the - * transformation. - */ - void InitRand(); - - /** - * @brief Applies the transformation defined in the data layer's - * transform_param block to the data. - * - * @param datum - * Datum containing the data to be transformed. - * @param transformed_blob - * This is destination blob. It can be part of top blob's data if - * set_cpu_data() is used. See data_layer.cpp for an example. - */ - void Transform(const Datum& datum, Blob* transformed_blob); - - /** - * @brief Applies the transformation defined in the data layer's - * transform_param block to a vector of Datum. - * - * @param datum_vector - * A vector of Datum containing the data to be transformed. - * @param transformed_blob - * This is destination blob. It can be part of top blob's data if - * set_cpu_data() is used. See memory_layer.cpp for an example. 
- */ - void Transform(const vector & datum_vector, - Blob* transformed_blob); - - /** - * @brief Applies the transformation defined in the data layer's - * transform_param block to a vector of Mat. - * - * @param mat_vector - * A vector of Mat containing the data to be transformed. - * @param transformed_blob - * This is destination blob. It can be part of top blob's data if - * set_cpu_data() is used. See memory_layer.cpp for an example. - */ - void Transform(const vector & mat_vector, - Blob* transformed_blob); - - /** - * @brief Applies the transformation defined in the data layer's - * transform_param block to a cv::Mat - * - * @param cv_img - * cv::Mat containing the data to be transformed. - * @param transformed_blob - * This is destination blob. It can be part of top blob's data if - * set_cpu_data() is used. See image_data_layer.cpp for an example. - */ - void Transform(const cv::Mat& cv_img, Blob* transformed_blob); - - /** - * @brief Applies the same transformation defined in the data layer's - * transform_param block to all the num images in a input_blob. - * - * @param input_blob - * A Blob containing the data to be transformed. It applies the same - * transformation to all the num images in the blob. - * @param transformed_blob - * This is destination blob, it will contain as many images as the - * input blob. It can be part of top blob's data. - */ - void Transform(Blob* input_blob, Blob* transformed_blob); - - /** - * @brief Infers the shape of transformed_blob will have when - * the transformation is applied to the data. - * - * @param datum - * Datum containing the data to be transformed. - */ - vector InferBlobShape(const Datum& datum); - /** - * @brief Infers the shape of transformed_blob will have when - * the transformation is applied to the data. - * It uses the first element to infer the shape of the blob. - * - * @param datum_vector - * A vector of Datum containing the data to be transformed. - */ - vector InferBlobShape(const vector & datum_vector); - /** - * @brief Infers the shape of transformed_blob will have when - * the transformation is applied to the data. - * It uses the first element to infer the shape of the blob. - * - * @param mat_vector - * A vector of Mat containing the data to be transformed. - */ - vector InferBlobShape(const vector & mat_vector); - /** - * @brief Infers the shape of transformed_blob will have when - * the transformation is applied to the data. - * - * @param cv_img - * cv::Mat containing the data to be transformed. - */ - vector InferBlobShape(const cv::Mat& cv_img); - - protected: - /** - * @brief Generates a random integer from Uniform({0, 1, ..., n-1}). - * - * @param n - * The upperbound (exclusive) value of the random number. - * @return - * A uniformly random integer value from ({0, 1, ..., n-1}). - */ - virtual int Rand(int n); - - void Transform(const Datum& datum, Dtype* transformed_data); - // Tranformation parameters - TransformationParameter param_; - - - shared_ptr rng_; - Phase phase_; - Blob data_mean_; - vector mean_values_; + public: + explicit DataTransformer(const TransformationParameter& param, Phase phase); + virtual ~DataTransformer() { + } + + /** + * @brief Initialize the Random number generations if needed by the + * transformation. + */ + void InitRand(); + + /** + * @brief Applies the transformation defined in the data layer's + * transform_param block to the data. + * + * @param datum + * Datum containing the data to be transformed. + * @param transformed_blob + * This is destination blob. 
It can be part of top blob's data if + * set_cpu_data() is used. See data_layer.cpp for an example. + */ + void Transform(const Datum& datum, Blob* transformed_blob); + + /** + * @brief Applies the transformation defined in the data layer's + * transform_param block to a vector of Datum. + * + * @param datum_vector + * A vector of Datum containing the data to be transformed. + * @param transformed_blob + * This is destination blob. It can be part of top blob's data if + * set_cpu_data() is used. See memory_layer.cpp for an example. + */ + void Transform(const vector & datum_vector, + Blob* transformed_blob); + + /** + * @brief Applies the transformation defined in the data layer's + * transform_param block to a vector of Mat. + * + * @param mat_vector + * A vector of Mat containing the data to be transformed. + * @param transformed_blob + * This is destination blob. It can be part of top blob's data if + * set_cpu_data() is used. See memory_layer.cpp for an example. + */ + void Transform(const vector & mat_vector, + Blob* transformed_blob); + + /** + * @brief Applies the transformation defined in the data layer's + * transform_param block to a cv::Mat + * + * @param cv_img + * cv::Mat containing the data to be transformed. + * @param transformed_blob + * This is destination blob. It can be part of top blob's data if + * set_cpu_data() is used. See image_data_layer.cpp for an example. + */ + void Transform(const cv::Mat& cv_img, Blob* transformed_blob); + + /** + * @brief Applies the same transformation defined in the data layer's + * transform_param block to all the num images in a input_blob. + * + * @param input_blob + * A Blob containing the data to be transformed. It applies the same + * transformation to all the num images in the blob. + * @param transformed_blob + * This is destination blob, it will contain as many images as the + * input blob. It can be part of top blob's data. + */ + void Transform(Blob* input_blob, Blob* transformed_blob); + + /** + * @brief Infers the shape of transformed_blob will have when + * the transformation is applied to the data. + * + * @param datum + * Datum containing the data to be transformed. + */ + vector InferBlobShape(const Datum& datum); + /** + * @brief Infers the shape of transformed_blob will have when + * the transformation is applied to the data. + * It uses the first element to infer the shape of the blob. + * + * @param datum_vector + * A vector of Datum containing the data to be transformed. + */ + vector InferBlobShape(const vector & datum_vector); + /** + * @brief Infers the shape of transformed_blob will have when + * the transformation is applied to the data. + * It uses the first element to infer the shape of the blob. + * + * @param mat_vector + * A vector of Mat containing the data to be transformed. + */ + vector InferBlobShape(const vector & mat_vector); + /** + * @brief Infers the shape of transformed_blob will have when + * the transformation is applied to the data. + * + * @param cv_img + * cv::Mat containing the data to be transformed. + */ + vector InferBlobShape(const cv::Mat& cv_img); + + protected: + /** + * @brief Generates a random integer from Uniform({0, 1, ..., n-1}). + * + * @param n + * The upperbound (exclusive) value of the random number. + * @return + * A uniformly random integer value from ({0, 1, ..., n-1}). 
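Putting the public interface above together, a data layer typically constructs one transformer, seeds its RNG, and then transforms each Datum into a destination blob. A minimal usage sketch, assuming a float net and the TRAIN phase (the variable names are illustrative, not from the patch):

    TransformationParameter transform_param;    // e.g. taken from the layer's proto
    DataTransformer<float> transformer(transform_param, TRAIN);
    transformer.InitRand();                      // needed if cropping or mirroring
    Datum datum;                                 // one encoded training example
    Blob<float> transformed;
    transformed.Reshape(transformer.InferBlobShape(datum));
    transformer.Transform(datum, &transformed);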
+ */ + virtual int Rand(int n); + + void Transform(const Datum& datum, Dtype* transformed_data); + // Tranformation parameters + TransformationParameter param_; + + shared_ptr rng_; + Phase phase_; + Blob data_mean_; + vector mean_values_; }; } // namespace caffe diff --git a/include/caffe/device.hpp b/include/caffe/device.hpp new file mode 100644 index 00000000..b6190f28 --- /dev/null +++ b/include/caffe/device.hpp @@ -0,0 +1,86 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
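The Device helper declared just below wraps OpenCL platform discovery, context and command-queue creation, and a kernel cache keyed by name. A hedged usage sketch for a GPU build (the kernel name "some_kernel" and the argument layout are placeholders, not kernels from this patch):

    #include <CL/cl.h>
    #include "caffe/device.hpp"
    void launch_example(cl_mem src, cl_mem dst, int count) {
      caffe::amdDevice.Init();  // discover a platform/device, set up context and queues
      cl_kernel k = caffe::amdDevice.GetKernel("some_kernel");  // placeholder name
      clSetKernelArg(k, 0, sizeof(cl_mem), &src);
      clSetKernelArg(k, 1, sizeof(cl_mem), &dst);
      clSetKernelArg(k, 2, sizeof(int), &count);
      size_t global = count;
      clEnqueueNDRangeKernel(caffe::amdDevice.CommandQueue, k, 1, NULL,
                             &global, NULL, 0, NULL, NULL);
    }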
+ **************************************************************************************/ + +#ifndef CAFFE_DEVICE_HPP +#define CAFFE_DEVICE_HPP +#include +#include +#include "caffe/common.hpp" +namespace caffe { +#ifndef CPU_ONLY +class Device { + public: + Device() + : numPlatforms(0), numDevices(0), device_id(INT_MIN) { + } + ~Device(); + cl_uint numPlatforms; + cl_platform_id * platformIDs; + char platformName[64]; + char openclVersion[64]; + cl_uint numDevices; + cl_device_id * DeviceIDs; + + cl_context Context; + cl_command_queue CommandQueue; + cl_command_queue CommandQueue_helper; + cl_program Program; + cl_device_id * pDevices; + int device_id; + + clblasOrder col; + clblasOrder row; + std::map Kernels; + + cl_int Init(int device_id = -1); + cl_int ConvertToString(std::string pFileName, std::string &Str); + void DisplayPlatformInfo(); + void DisplayInfo(cl_platform_id id, cl_platform_info name, std::string str); + + void GetDeviceInfo(); + void DeviceQuery(); + int GetDevice() { + return device_id; + } + ; + void BuildProgram(std::string kernel_dir); + + template + void DisplayDeviceInfo(cl_device_id id, cl_device_info name, + std::string str); + template + void appendBitfield(T info, T value, std::string name, std::string &str); + + cl_kernel GetKernel(std::string kernel_name); + void ReleaseKernels(); +}; +extern std::string buildOption; +extern Device amdDevice; +#endif +} // namespace caffe + +#endif //CAFFE_DEVICE_HPP + diff --git a/include/caffe/filler.hpp b/include/caffe/filler.hpp index 888f4a4b..ab9d6b39 100644 --- a/include/caffe/filler.hpp +++ b/include/caffe/filler.hpp @@ -18,111 +18,119 @@ namespace caffe { /// @brief Fills a Blob with constant or randomly-generated data. template class Filler { - public: - explicit Filler(const FillerParameter& param) : filler_param_(param) {} - virtual ~Filler() {} - virtual void Fill(Blob* blob) = 0; - protected: - FillerParameter filler_param_; -}; // class Filler - + public: + explicit Filler(const FillerParameter& param) + : filler_param_(param) { + } + virtual ~Filler() { + } + virtual void Fill(Blob* blob) = 0; + protected: + FillerParameter filler_param_; +}; +// class Filler /// @brief Fills a Blob with constant values @f$ x = 0 @f$. template -class ConstantFiller : public Filler { - public: - explicit ConstantFiller(const FillerParameter& param) - : Filler(param) {} - virtual void Fill(Blob* blob) { - Dtype* data = blob->mutable_cpu_data(); - const int count = blob->count(); - const Dtype value = this->filler_param_.value(); - CHECK(count); - for (int i = 0; i < count; ++i) { - data[i] = value; +class ConstantFiller: public Filler { + public: + explicit ConstantFiller(const FillerParameter& param) + : Filler(param) { + } + virtual void Fill(Blob* blob) { + Dtype* data = blob->mutable_cpu_data(); + const int count = blob->count(); + const Dtype value = this->filler_param_.value(); + CHECK(count); + for (int i = 0; i < count; ++i) { + data[i] = value; + } + CHECK_EQ(this->filler_param_.sparse(), -1) + << "Sparsity not supported by this Filler."; } - CHECK_EQ(this->filler_param_.sparse(), -1) - << "Sparsity not supported by this Filler."; - } }; /// @brief Fills a Blob with uniformly distributed values @f$ x\sim U(a, b) @f$. 
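A minimal sketch, not part of the patch, of how the OpenCL Device wrapper introduced in device.hpp above might be used. Only Init(), GetKernel() and the CommandQueue member come from the header; the kernel name "ReLUForward" and its argument layout are hypothetical, and the cl* calls are standard OpenCL 1.x API.

// Sketch only: look up a prebuilt kernel through the global amdDevice and launch it.
#include <CL/cl.h>
#include "caffe/device.hpp"

void run_kernel_sketch(cl_mem src, cl_mem dst, int count) {
  caffe::amdDevice.Init();                                   // default OpenCL device
  cl_kernel k = caffe::amdDevice.GetKernel("ReLUForward");   // hypothetical kernel name
  clSetKernelArg(k, 0, sizeof(cl_mem), &src);
  clSetKernelArg(k, 1, sizeof(cl_mem), &dst);
  clSetKernelArg(k, 2, sizeof(int), &count);
  size_t global = static_cast<size_t>(count);
  clEnqueueNDRangeKernel(caffe::amdDevice.CommandQueue, k, 1, NULL,
                         &global, NULL, 0, NULL, NULL);
  clFinish(caffe::amdDevice.CommandQueue);
}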
template -class UniformFiller : public Filler { - public: - explicit UniformFiller(const FillerParameter& param) - : Filler(param) {} - virtual void Fill(Blob* blob) { - CHECK(blob->count()); - caffe_rng_uniform(blob->count(), Dtype(this->filler_param_.min()), - Dtype(this->filler_param_.max()), blob->mutable_cpu_data()); - CHECK_EQ(this->filler_param_.sparse(), -1) - << "Sparsity not supported by this Filler."; - } +class UniformFiller: public Filler { + public: + explicit UniformFiller(const FillerParameter& param) + : Filler(param) { + } + virtual void Fill(Blob* blob) { + CHECK(blob->count()); + caffe_rng_uniform(blob->count(), Dtype(this->filler_param_.min()), + Dtype(this->filler_param_.max()), blob->mutable_cpu_data()); + CHECK_EQ(this->filler_param_.sparse(), -1) + << "Sparsity not supported by this Filler."; + } }; /// @brief Fills a Blob with Gaussian-distributed values @f$ x = a @f$. template -class GaussianFiller : public Filler { - public: - explicit GaussianFiller(const FillerParameter& param) - : Filler(param) {} - virtual void Fill(Blob* blob) { - Dtype* data = blob->mutable_cpu_data(); - CHECK(blob->count()); - caffe_rng_gaussian(blob->count(), Dtype(this->filler_param_.mean()), - Dtype(this->filler_param_.std()), blob->mutable_cpu_data()); - int sparse = this->filler_param_.sparse(); - CHECK_GE(sparse, -1); - if (sparse >= 0) { - // Sparse initialization is implemented for "weight" blobs; i.e. matrices. - // These have num == channels == 1; width is number of inputs; height is - // number of outputs. The 'sparse' variable specifies the mean number - // of non-zero input weights for a given output. - CHECK_GE(blob->num_axes(), 1); - const int num_outputs = blob->shape(0); - Dtype non_zero_probability = Dtype(sparse) / Dtype(num_outputs); - rand_vec_.reset(new SyncedMemory(blob->count() * sizeof(int))); - int* mask = reinterpret_cast(rand_vec_->mutable_cpu_data()); - caffe_rng_bernoulli(blob->count(), non_zero_probability, mask); - for (int i = 0; i < blob->count(); ++i) { - data[i] *= mask[i]; +class GaussianFiller: public Filler { + public: + explicit GaussianFiller(const FillerParameter& param) + : Filler(param) { + } + virtual void Fill(Blob* blob) { + Dtype* data = blob->mutable_cpu_data(); + CHECK(blob->count()); + caffe_rng_gaussian(blob->count(), + Dtype(this->filler_param_.mean()), Dtype(this->filler_param_.std()), + blob->mutable_cpu_data()); + int sparse = this->filler_param_.sparse(); + CHECK_GE(sparse, -1); + if (sparse >= 0) { + // Sparse initialization is implemented for "weight" blobs; i.e. matrices. + // These have num == channels == 1; width is number of inputs; height is + // number of outputs. The 'sparse' variable specifies the mean number + // of non-zero input weights for a given output. + CHECK_GE(blob->num_axes(), 1); + const int num_outputs = blob->shape(0); + Dtype non_zero_probability = Dtype(sparse) / Dtype(num_outputs); + rand_vec_.reset(new SyncedMemory(blob->count() * sizeof(int))); + int* mask = reinterpret_cast(rand_vec_->mutable_cpu_data()); + caffe_rng_bernoulli(blob->count(), non_zero_probability, mask); + for (int i = 0; i < blob->count(); ++i) { + data[i] *= mask[i]; + } } } - } - protected: - shared_ptr rand_vec_; + protected: + shared_ptr rand_vec_; }; /** @brief Fills a Blob with values @f$ x \in [0, 1] @f$ * such that @f$ \forall i \sum_j x_{ij} = 1 @f$. 
*/ template -class PositiveUnitballFiller : public Filler { - public: - explicit PositiveUnitballFiller(const FillerParameter& param) - : Filler(param) {} - virtual void Fill(Blob* blob) { - Dtype* data = blob->mutable_cpu_data(); - DCHECK(blob->count()); - caffe_rng_uniform(blob->count(), 0, 1, blob->mutable_cpu_data()); - // We expect the filler to not be called very frequently, so we will - // just use a simple implementation - int dim = blob->count() / blob->num(); - CHECK(dim); - for (int i = 0; i < blob->num(); ++i) { - Dtype sum = 0; - for (int j = 0; j < dim; ++j) { - sum += data[i * dim + j]; - } - for (int j = 0; j < dim; ++j) { - data[i * dim + j] /= sum; +class PositiveUnitballFiller: public Filler { + public: + explicit PositiveUnitballFiller(const FillerParameter& param) + : Filler(param) { + } + virtual void Fill(Blob* blob) { + Dtype* data = blob->mutable_cpu_data(); + DCHECK(blob->count()); + caffe_rng_uniform(blob->count(), 0, 1, blob->mutable_cpu_data()); + // We expect the filler to not be called very frequently, so we will + // just use a simple implementation + int dim = blob->count() / blob->num(); + CHECK(dim); + for (int i = 0; i < blob->num(); ++i) { + Dtype sum = 0; + for (int j = 0; j < dim; ++j) { + sum += data[i * dim + j]; + } + for (int j = 0; j < dim; ++j) { + data[i * dim + j] /= sum; + } } + CHECK_EQ(this->filler_param_.sparse(), -1) + << "Sparsity not supported by this Filler."; } - CHECK_EQ(this->filler_param_.sparse(), -1) - << "Sparsity not supported by this Filler."; - } }; /** @@ -142,28 +150,29 @@ class PositiveUnitballFiller : public Filler { * TODO(dox): make notation in above comment consistent with rest & use LaTeX. */ template -class XavierFiller : public Filler { - public: - explicit XavierFiller(const FillerParameter& param) - : Filler(param) {} - virtual void Fill(Blob* blob) { - CHECK(blob->count()); - int fan_in = blob->count() / blob->num(); - int fan_out = blob->count() / blob->channels(); - Dtype n = fan_in; // default to fan_in - if (this->filler_param_.variance_norm() == - FillerParameter_VarianceNorm_AVERAGE) { - n = (fan_in + fan_out) / Dtype(2); - } else if (this->filler_param_.variance_norm() == - FillerParameter_VarianceNorm_FAN_OUT) { - n = fan_out; +class XavierFiller: public Filler { + public: + explicit XavierFiller(const FillerParameter& param) + : Filler(param) { + } + virtual void Fill(Blob* blob) { + CHECK(blob->count()); + int fan_in = blob->count() / blob->num(); + int fan_out = blob->count() / blob->channels(); + Dtype n = fan_in; // default to fan_in + if (this->filler_param_.variance_norm() + == FillerParameter_VarianceNorm_AVERAGE) { + n = (fan_in + fan_out) / Dtype(2); + } else if (this->filler_param_.variance_norm() + == FillerParameter_VarianceNorm_FAN_OUT) { + n = fan_out; + } + Dtype scale = sqrt(Dtype(3) / n); + caffe_rng_uniform(blob->count(), -scale, scale, + blob->mutable_cpu_data()); + CHECK_EQ(this->filler_param_.sparse(), -1) + << "Sparsity not supported by this Filler."; } - Dtype scale = sqrt(Dtype(3) / n); - caffe_rng_uniform(blob->count(), -scale, scale, - blob->mutable_cpu_data()); - CHECK_EQ(this->filler_param_.sparse(), -1) - << "Sparsity not supported by this Filler."; - } }; /** @@ -184,82 +193,84 @@ class XavierFiller : public Filler { * is currently not the case for inner product layers. 
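As a usage note, the fillers in this header are normally obtained through the GetFiller factory declared at the end of the file rather than constructed directly. The sketch below is illustrative and not part of the patch; the blob shape and the choice of variance_norm are examples only.

// Sketch only: fill a 4-D weight blob with the XavierFiller via GetFiller.
#include "caffe/blob.hpp"
#include "caffe/filler.hpp"

void fill_weights_sketch() {
  caffe::Blob<float> weights(64, 3, 7, 7);        // num, channels, height, width
  caffe::FillerParameter fp;
  fp.set_type("xavier");
  fp.set_variance_norm(caffe::FillerParameter_VarianceNorm_AVERAGE);
  caffe::Filler<float>* filler = caffe::GetFiller<float>(fp);
  filler->Fill(&weights);
  delete filler;
  // With AVERAGE, n = (fan_in + fan_out) / 2 and values are drawn from
  // U(-sqrt(3 / n), +sqrt(3 / n)), matching the Fill() body above.
}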
*/ template -class MSRAFiller : public Filler { - public: - explicit MSRAFiller(const FillerParameter& param) - : Filler(param) {} - virtual void Fill(Blob* blob) { - CHECK(blob->count()); - int fan_in = blob->count() / blob->num(); - int fan_out = blob->count() / blob->channels(); - Dtype n = fan_in; // default to fan_in - if (this->filler_param_.variance_norm() == - FillerParameter_VarianceNorm_AVERAGE) { - n = (fan_in + fan_out) / Dtype(2); - } else if (this->filler_param_.variance_norm() == - FillerParameter_VarianceNorm_FAN_OUT) { - n = fan_out; +class MSRAFiller: public Filler { + public: + explicit MSRAFiller(const FillerParameter& param) + : Filler(param) { + } + virtual void Fill(Blob* blob) { + CHECK(blob->count()); + int fan_in = blob->count() / blob->num(); + int fan_out = blob->count() / blob->channels(); + Dtype n = fan_in; // default to fan_in + if (this->filler_param_.variance_norm() + == FillerParameter_VarianceNorm_AVERAGE) { + n = (fan_in + fan_out) / Dtype(2); + } else if (this->filler_param_.variance_norm() + == FillerParameter_VarianceNorm_FAN_OUT) { + n = fan_out; + } + Dtype std = sqrt(Dtype(2) / n); + caffe_rng_gaussian(blob->count(), Dtype(0), std, + blob->mutable_cpu_data()); + CHECK_EQ(this->filler_param_.sparse(), -1) + << "Sparsity not supported by this Filler."; } - Dtype std = sqrt(Dtype(2) / n); - caffe_rng_gaussian(blob->count(), Dtype(0), std, - blob->mutable_cpu_data()); - CHECK_EQ(this->filler_param_.sparse(), -1) - << "Sparsity not supported by this Filler."; - } }; /*! -@brief Fills a Blob with coefficients for bilinear interpolation. + @brief Fills a Blob with coefficients for bilinear interpolation. -A common use case is with the DeconvolutionLayer acting as upsampling. -You can upsample a feature map with shape of (B, C, H, W) by any integer factor -using the following proto. -\code -layer { - name: "upsample", type: "Deconvolution" - bottom: "{{bottom_name}}" top: "{{top_name}}" - convolution_param { - kernel_size: {{2 * factor - factor % 2}} stride: {{factor}} - num_output: {{C}} group: {{C}} - pad: {{ceil((factor - 1) / 2.)}} - weight_filler: { type: "bilinear" } bias_term: false - } - param { lr_mult: 0 decay_mult: 0 } -} -\endcode -Please use this by replacing `{{}}` with your values. By specifying -`num_output: {{C}} group: {{C}}`, it behaves as -channel-wise convolution. The filter shape of this deconvolution layer will be -(C, 1, K, K) where K is `kernel_size`, and this filler will set a (K, K) -interpolation kernel for every channel of the filter identically. The resulting -shape of the top feature map will be (B, C, factor * H, factor * W). -Note that the learning rate and the -weight decay are set to 0 in order to keep coefficient values of bilinear -interpolation unchanged during training. If you apply this to an image, this -operation is equivalent to the following call in Python with Scikit.Image. -\code{.py} -out = skimage.transform.rescale(img, factor, mode='constant', cval=0) -\endcode + A common use case is with the DeconvolutionLayer acting as upsampling. + You can upsample a feature map with shape of (B, C, H, W) by any integer factor + using the following proto. 
+ \code + layer { + name: "upsample", type: "Deconvolution" + bottom: "{{bottom_name}}" top: "{{top_name}}" + convolution_param { + kernel_size: {{2 * factor - factor % 2}} stride: {{factor}} + num_output: {{C}} group: {{C}} + pad: {{ceil((factor - 1) / 2.)}} + weight_filler: { type: "bilinear" } bias_term: false + } + param { lr_mult: 0 decay_mult: 0 } + } + \endcode + Please use this by replacing `{{}}` with your values. By specifying + `num_output: {{C}} group: {{C}}`, it behaves as + channel-wise convolution. The filter shape of this deconvolution layer will be + (C, 1, K, K) where K is `kernel_size`, and this filler will set a (K, K) + interpolation kernel for every channel of the filter identically. The resulting + shape of the top feature map will be (B, C, factor * H, factor * W). + Note that the learning rate and the + weight decay are set to 0 in order to keep coefficient values of bilinear + interpolation unchanged during training. If you apply this to an image, this + operation is equivalent to the following call in Python with Scikit.Image. + \code{.py} + out = skimage.transform.rescale(img, factor, mode='constant', cval=0) + \endcode */ template -class BilinearFiller : public Filler { - public: - explicit BilinearFiller(const FillerParameter& param) - : Filler(param) {} - virtual void Fill(Blob* blob) { - CHECK_EQ(blob->num_axes(), 4) << "Blob must be 4 dim."; - CHECK_EQ(blob->width(), blob->height()) << "Filter must be square"; - Dtype* data = blob->mutable_cpu_data(); - int f = ceil(blob->width() / 2.); - float c = (2 * f - 1 - f % 2) / (2. * f); - for (int i = 0; i < blob->count(); ++i) { - float x = i % blob->width(); - float y = (i / blob->width()) % blob->height(); - data[i] = (1 - fabs(x / f - c)) * (1 - fabs(y / f - c)); +class BilinearFiller: public Filler { + public: + explicit BilinearFiller(const FillerParameter& param) + : Filler(param) { + } + virtual void Fill(Blob* blob) { + CHECK_EQ(blob->num_axes(), 4) << "Blob must be 4 dim."; + CHECK_EQ(blob->width(), blob->height()) << "Filter must be square"; + Dtype* data = blob->mutable_cpu_data(); + int f = ceil(blob->width() / 2.); + float c = (2 * f - 1 - f % 2) / (2. * f); + for (int i = 0; i < blob->count(); ++i) { + float x = i % blob->width(); + float y = (i / blob->width()) % blob->height(); + data[i] = (1 - fabs(x / f - c)) * (1 - fabs(y / f - c)); + } + CHECK_EQ(this->filler_param_.sparse(), -1) + << "Sparsity not supported by this Filler."; } - CHECK_EQ(this->filler_param_.sparse(), -1) - << "Sparsity not supported by this Filler."; - } }; /** @@ -288,7 +299,7 @@ Filler* GetFiller(const FillerParameter& param) { } else { CHECK(false) << "Unknown filler name: " << param.type(); } - return (Filler*)(NULL); + return (Filler*) (NULL); } } // namespace caffe diff --git a/include/caffe/internal_thread.hpp b/include/caffe/internal_thread.hpp index 815ca546..dd8ae8bf 100644 --- a/include/caffe/internal_thread.hpp +++ b/include/caffe/internal_thread.hpp @@ -7,7 +7,9 @@ Forward declare boost::thread instead of including boost/thread.hpp to avoid a boost/NVCC issues (#1009, #1010) on OSX. */ -namespace boost { class thread; } +namespace boost { +class thread; +} namespace caffe { @@ -17,24 +19,27 @@ namespace caffe { * by reimplementing the virutal function InternalThreadEntry. */ class InternalThread { - public: - InternalThread() : thread_() {} - virtual ~InternalThread(); + public: + InternalThread() + : thread_() { + } + virtual ~InternalThread(); - /** Returns true if the thread was successfully started. 
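The intended use of InternalThread, declared in internal_thread.hpp here, is to subclass it and override InternalThreadEntry(), as the prefetching data layers do. The sketch below is illustrative and not part of the patch; the work done inside the thread is made up.

// Sketch only: a hypothetical InternalThread subclass and its driver.
#include <glog/logging.h>
#include "caffe/internal_thread.hpp"

class PrefetchSketch : public caffe::InternalThread {
 protected:
  virtual void InternalThreadEntry() {
    // Runs on the internal thread once StartInternalThread() is called.
    LOG(INFO) << "prefetching in the background";
  }
};

void run_prefetch_sketch() {
  PrefetchSketch worker;
  CHECK(worker.StartInternalThread()) << "failed to start thread";
  // ... main thread does other work here ...
  CHECK(worker.WaitForInternalThreadToExit()) << "thread did not exit cleanly";
}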
**/ - bool StartInternalThread(); + /** Returns true if the thread was successfully started. **/ + bool StartInternalThread(); - /** Will not return until the internal thread has exited. */ - bool WaitForInternalThreadToExit(); + /** Will not return until the internal thread has exited. */ + bool WaitForInternalThreadToExit(); - bool is_started() const; + bool is_started() const; - protected: - /* Implement this method in your subclass - with the code you want your thread to run. */ - virtual void InternalThreadEntry() {} + protected: + /* Implement this method in your subclass + with the code you want your thread to run. */ + virtual void InternalThreadEntry() { + } - shared_ptr thread_; + shared_ptr thread_; }; } // namespace caffe diff --git a/include/caffe/layer.hpp b/include/caffe/layer.hpp index e2eba196..c346ede1 100644 --- a/include/caffe/layer.hpp +++ b/include/caffe/layer.hpp @@ -25,14 +25,14 @@ namespace caffe { */ template class Layer { - public: - /** - * You should not implement your own constructor. Any set up code should go - * to SetUp(), where the dimensions of the bottom blobs are provided to the - * layer. - */ - explicit Layer(const LayerParameter& param) - : layer_param_(param) { + public: + /** + * You should not implement your own constructor. Any set up code should go + * to SetUp(), where the dimensions of the bottom blobs are provided to the + * layer. + */ + explicit Layer(const LayerParameter& param) + : layer_param_(param) { // Set phase and copy blobs (if there are any). phase_ = param.phase(); if (layer_param_.blobs_size() > 0) { @@ -43,361 +43,384 @@ class Layer { } } } - virtual ~Layer() {} - - /** - * @brief Implements common layer setup functionality. - * - * @param bottom the preshaped input blobs - * @param top - * the allocated but unshaped output blobs, to be shaped by Reshape - * - * Checks that the number of bottom and top blobs is correct. - * Calls LayerSetUp to do special layer setup for individual layer types, - * followed by Reshape to set up sizes of top blobs and internal buffers. - * Sets up the loss weight multiplier blobs for any non-zero loss weights. - * This method may not be overridden. - */ - void SetUp(const vector*>& bottom, - const vector*>& top) { - CheckBlobCounts(bottom, top); - LayerSetUp(bottom, top); - Reshape(bottom, top); - SetLossWeights(top); - } - - /** - * @brief Does layer-specific setup: your layer should implement this function - * as well as Reshape. - * - * @param bottom - * the preshaped input blobs, whose data fields store the input data for - * this layer - * @param top - * the allocated but unshaped output blobs - * - * This method should do one-time layer specific setup. This includes reading - * and processing relevent parameters from the layer_param_. - * Setting up the shapes of top blobs and internal buffers should be done in - * Reshape, which will be called before the forward pass to - * adjust the top blob sizes. - */ - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top) {} - - /** - * @brief Adjust the shapes of top blobs and internal buffers to accomodate - * the shapes of the bottom blobs. - * - * @param bottom the input blobs, with the requested input shapes - * @param top the top blobs, which should be reshaped as needed - * - * This method should reshape top blobs as needed according to the shapes - * of the bottom (input) blobs, as well as reshaping any internal buffers - * and making any other necessary adjustments so that the layer can - * accomodate the bottom blobs. 
- */ - virtual void Reshape(const vector*>& bottom, - const vector*>& top) = 0; - - /** - * @brief Given the bottom blobs, compute the top blobs and the loss. - * - * @param bottom - * the input blobs, whose data fields store the input data for this layer - * @param top - * the preshaped output blobs, whose data fields will store this layers' - * outputs - * \return The total loss from the layer. - * - * The Forward wrapper calls the relevant device wrapper function - * (Forward_cpu or Forward_gpu) to compute the top blob values given the - * bottom blobs. If the layer has any non-zero loss_weights, the wrapper - * then computes and returns the loss. - * - * Your layer should implement Forward_cpu and (optionally) Forward_gpu. - */ - inline Dtype Forward(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Given the top blob error gradients, compute the bottom blob error - * gradients. - * - * @param top - * the output blobs, whose diff fields store the gradient of the error - * with respect to themselves - * @param propagate_down - * a vector with equal length to bottom, with each index indicating - * whether to propagate the error gradients down to the bottom blob at - * the corresponding index - * @param bottom - * the input blobs, whose diff fields will store the gradient of the error - * with respect to themselves after Backward is run - * - * The Backward wrapper calls the relevant device wrapper function - * (Backward_cpu or Backward_gpu) to compute the bottom blob diffs given the - * top blob diffs. - * - * Your layer should implement Backward_cpu and (optionally) Backward_gpu. - */ - inline void Backward(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom); - - /** - * @brief Returns the vector of learnable parameter blobs. - */ - vector > >& blobs() { - return blobs_; - } - - /** - * @brief Returns the layer parameter. - */ - const LayerParameter& layer_param() const { return layer_param_; } - - /** - * @brief Writes the layer parameter to a protocol buffer - */ - virtual void ToProto(LayerParameter* param, bool write_diff = false); - - /** - * @brief Returns the scalar loss associated with a top blob at a given index. - */ - inline Dtype loss(const int top_index) const { - return (loss_.size() > top_index) ? loss_[top_index] : Dtype(0); - } - - /** - * @brief Sets the loss associated with a top blob at a given index. - */ - inline void set_loss(const int top_index, const Dtype value) { - if (loss_.size() <= top_index) { - loss_.resize(top_index + 1, Dtype(0)); + virtual ~Layer() { } - loss_[top_index] = value; - } - /** - * @brief Returns the layer type. - */ - virtual inline const char* type() const { return ""; } - - /** - * @brief Returns the exact number of bottom blobs required by the layer, - * or -1 if no exact number is required. - * - * This method should be overridden to return a non-negative value if your - * layer expects some exact number of bottom blobs. - */ - virtual inline int ExactNumBottomBlobs() const { return -1; } - /** - * @brief Returns the minimum number of bottom blobs required by the layer, - * or -1 if no minimum number is required. - * - * This method should be overridden to return a non-negative value if your - * layer expects some minimum number of bottom blobs. - */ - virtual inline int MinBottomBlobs() const { return -1; } - /** - * @brief Returns the maximum number of bottom blobs required by the layer, - * or -1 if no maximum number is required. 
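To make the Layer contract above concrete, here is a minimal CPU-only layer sketch that fills in the hooks being documented: the blob-count queries, Reshape, and the _cpu compute methods. The class is hypothetical, not part of Caffe or this patch; it simply copies bottom to top using caffe_copy from math_functions.hpp.

// Sketch only: a hypothetical identity layer exercising the Layer interface.
#include <vector>
#include "caffe/layer.hpp"
#include "caffe/util/math_functions.hpp"

template <typename Dtype>
class IdentitySketchLayer : public caffe::Layer<Dtype> {
 public:
  explicit IdentitySketchLayer(const caffe::LayerParameter& param)
      : caffe::Layer<Dtype>(param) {}
  virtual inline const char* type() const { return "IdentitySketch"; }
  virtual inline int ExactNumBottomBlobs() const { return 1; }
  virtual inline int ExactNumTopBlobs() const { return 1; }
  virtual void Reshape(const std::vector<caffe::Blob<Dtype>*>& bottom,
                       const std::vector<caffe::Blob<Dtype>*>& top) {
    top[0]->ReshapeLike(*bottom[0]);
  }

 protected:
  virtual void Forward_cpu(const std::vector<caffe::Blob<Dtype>*>& bottom,
                           const std::vector<caffe::Blob<Dtype>*>& top) {
    caffe::caffe_copy(bottom[0]->count(), bottom[0]->cpu_data(),
                      top[0]->mutable_cpu_data());
  }
  virtual void Backward_cpu(const std::vector<caffe::Blob<Dtype>*>& top,
                            const std::vector<bool>& propagate_down,
                            const std::vector<caffe::Blob<Dtype>*>& bottom) {
    if (propagate_down[0]) {
      caffe::caffe_copy(top[0]->count(), top[0]->cpu_diff(),
                        bottom[0]->mutable_cpu_diff());
    }
  }
};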
- * - * This method should be overridden to return a non-negative value if your - * layer expects some maximum number of bottom blobs. - */ - virtual inline int MaxBottomBlobs() const { return -1; } - /** - * @brief Returns the exact number of top blobs required by the layer, - * or -1 if no exact number is required. - * - * This method should be overridden to return a non-negative value if your - * layer expects some exact number of top blobs. - */ - virtual inline int ExactNumTopBlobs() const { return -1; } - /** - * @brief Returns the minimum number of top blobs required by the layer, - * or -1 if no minimum number is required. - * - * This method should be overridden to return a non-negative value if your - * layer expects some minimum number of top blobs. - */ - virtual inline int MinTopBlobs() const { return -1; } - /** - * @brief Returns the maximum number of top blobs required by the layer, - * or -1 if no maximum number is required. - * - * This method should be overridden to return a non-negative value if your - * layer expects some maximum number of top blobs. - */ - virtual inline int MaxTopBlobs() const { return -1; } - /** - * @brief Returns true if the layer requires an equal number of bottom and - * top blobs. - * - * This method should be overridden to return true if your layer expects an - * equal number of bottom and top blobs. - */ - virtual inline bool EqualNumBottomTopBlobs() const { return false; } + /** + * @brief Implements common layer setup functionality. + * + * @param bottom the preshaped input blobs + * @param top + * the allocated but unshaped output blobs, to be shaped by Reshape + * + * Checks that the number of bottom and top blobs is correct. + * Calls LayerSetUp to do special layer setup for individual layer types, + * followed by Reshape to set up sizes of top blobs and internal buffers. + * Sets up the loss weight multiplier blobs for any non-zero loss weights. + * This method may not be overridden. + */ + void SetUp(const vector*>& bottom, + const vector*>& top) { + CheckBlobCounts(bottom, top); + LayerSetUp(bottom, top); + Reshape(bottom, top); + SetLossWeights(top); + } - /** - * @brief Return whether "anonymous" top blobs are created automatically - * by the layer. - * - * If this method returns true, Net::Init will create enough "anonymous" top - * blobs to fulfill the requirement specified by ExactNumTopBlobs() or - * MinTopBlobs(). - */ - virtual inline bool AutoTopBlobs() const { return false; } + /** + * @brief Does layer-specific setup: your layer should implement this function + * as well as Reshape. + * + * @param bottom + * the preshaped input blobs, whose data fields store the input data for + * this layer + * @param top + * the allocated but unshaped output blobs + * + * This method should do one-time layer specific setup. This includes reading + * and processing relevent parameters from the layer_param_. + * Setting up the shapes of top blobs and internal buffers should be done in + * Reshape, which will be called before the forward pass to + * adjust the top blob sizes. + */ + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top) { + } - /** - * @brief Return whether to allow force_backward for a given bottom blob - * index. - * - * If AllowForceBackward(i) == false, we will ignore the force_backward - * setting and backpropagate to blob i only if it needs gradient information - * (as is done when force_backward == false). 
- */ - virtual inline bool AllowForceBackward(const int bottom_index) const { - return true; - } + /** + * @brief Adjust the shapes of top blobs and internal buffers to accomodate + * the shapes of the bottom blobs. + * + * @param bottom the input blobs, with the requested input shapes + * @param top the top blobs, which should be reshaped as needed + * + * This method should reshape top blobs as needed according to the shapes + * of the bottom (input) blobs, as well as reshaping any internal buffers + * and making any other necessary adjustments so that the layer can + * accomodate the bottom blobs. + */ + virtual void Reshape(const vector*>& bottom, + const vector*>& top) = 0; + + /** + * @brief Given the bottom blobs, compute the top blobs and the loss. + * + * @param bottom + * the input blobs, whose data fields store the input data for this layer + * @param top + * the preshaped output blobs, whose data fields will store this layers' + * outputs + * \return The total loss from the layer. + * + * The Forward wrapper calls the relevant device wrapper function + * (Forward_cpu or Forward_gpu) to compute the top blob values given the + * bottom blobs. If the layer has any non-zero loss_weights, the wrapper + * then computes and returns the loss. + * + * Your layer should implement Forward_cpu and (optionally) Forward_gpu. + */ + inline Dtype Forward(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Given the top blob error gradients, compute the bottom blob error + * gradients. + * + * @param top + * the output blobs, whose diff fields store the gradient of the error + * with respect to themselves + * @param propagate_down + * a vector with equal length to bottom, with each index indicating + * whether to propagate the error gradients down to the bottom blob at + * the corresponding index + * @param bottom + * the input blobs, whose diff fields will store the gradient of the error + * with respect to themselves after Backward is run + * + * The Backward wrapper calls the relevant device wrapper function + * (Backward_cpu or Backward_gpu) to compute the bottom blob diffs given the + * top blob diffs. + * + * Your layer should implement Backward_cpu and (optionally) Backward_gpu. + */ + inline void Backward(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + /** + * @brief Returns the vector of learnable parameter blobs. + */ + vector > >& blobs() { + return blobs_; + } - /** - * @brief Specifies whether the layer should compute gradients w.r.t. a - * parameter at a particular index given by param_id. - * - * You can safely ignore false values and always compute gradients - * for all parameters, but possibly with wasteful computation. - */ - inline bool param_propagate_down(const int param_id) { - return (param_propagate_down_.size() > param_id) ? - param_propagate_down_[param_id] : false; - } - /** - * @brief Sets whether the layer should compute gradients w.r.t. a - * parameter at a particular index given by param_id. - */ - inline void set_param_propagate_down(const int param_id, const bool value) { - if (param_propagate_down_.size() <= param_id) { - param_propagate_down_.resize(param_id + 1, true); + /** + * @brief Returns the layer parameter. 
+ */ + const LayerParameter& layer_param() const { + return layer_param_; } - param_propagate_down_[param_id] = value; - } + /** + * @brief Writes the layer parameter to a protocol buffer + */ + virtual void ToProto(LayerParameter* param, bool write_diff = false); - protected: - /** The protobuf that stores the layer parameters */ - LayerParameter layer_param_; - /** The phase: TRAIN or TEST */ - Phase phase_; - /** The vector that stores the learnable parameters as a set of blobs. */ - vector > > blobs_; - /** Vector indicating whether to compute the diff of each param blob. */ - vector param_propagate_down_; + /** + * @brief Returns the scalar loss associated with a top blob at a given index. + */ + inline Dtype loss(const int top_index) const { + return (loss_.size() > top_index) ? loss_[top_index] : Dtype(0); + } - /** The vector that indicates whether each top blob has a non-zero weight in - * the objective function. */ - vector loss_; + /** + * @brief Sets the loss associated with a top blob at a given index. + */ + inline void set_loss(const int top_index, const Dtype value) { + if (loss_.size() <= top_index) { + loss_.resize(top_index + 1, Dtype(0)); + } + loss_[top_index] = value; + } - /** @brief Using the CPU device, compute the layer output. */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top) = 0; - /** - * @brief Using the GPU device, compute the layer output. - * Fall back to Forward_cpu() if unavailable. - */ - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top) { - // LOG(WARNING) << "Using CPU code as backup."; - return Forward_cpu(bottom, top); - } + /** + * @brief Returns the layer type. + */ + virtual inline const char* type() const { + return ""; + } - /** - * @brief Using the CPU device, compute the gradients for any parameters and - * for the bottom blobs if propagate_down is true. - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) = 0; - /** - * @brief Using the GPU device, compute the gradients for any parameters and - * for the bottom blobs if propagate_down is true. - * Fall back to Backward_cpu() if unavailable. - */ - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - // LOG(WARNING) << "Using CPU code as backup."; - Backward_cpu(top, propagate_down, bottom); - } + /** + * @brief Returns the exact number of bottom blobs required by the layer, + * or -1 if no exact number is required. + * + * This method should be overridden to return a non-negative value if your + * layer expects some exact number of bottom blobs. + */ + virtual inline int ExactNumBottomBlobs() const { + return -1; + } + /** + * @brief Returns the minimum number of bottom blobs required by the layer, + * or -1 if no minimum number is required. + * + * This method should be overridden to return a non-negative value if your + * layer expects some minimum number of bottom blobs. + */ + virtual inline int MinBottomBlobs() const { + return -1; + } + /** + * @brief Returns the maximum number of bottom blobs required by the layer, + * or -1 if no maximum number is required. + * + * This method should be overridden to return a non-negative value if your + * layer expects some maximum number of bottom blobs. + */ + virtual inline int MaxBottomBlobs() const { + return -1; + } + /** + * @brief Returns the exact number of top blobs required by the layer, + * or -1 if no exact number is required. 
+ * + * This method should be overridden to return a non-negative value if your + * layer expects some exact number of top blobs. + */ + virtual inline int ExactNumTopBlobs() const { + return -1; + } + /** + * @brief Returns the minimum number of top blobs required by the layer, + * or -1 if no minimum number is required. + * + * This method should be overridden to return a non-negative value if your + * layer expects some minimum number of top blobs. + */ + virtual inline int MinTopBlobs() const { + return -1; + } + /** + * @brief Returns the maximum number of top blobs required by the layer, + * or -1 if no maximum number is required. + * + * This method should be overridden to return a non-negative value if your + * layer expects some maximum number of top blobs. + */ + virtual inline int MaxTopBlobs() const { + return -1; + } + /** + * @brief Returns true if the layer requires an equal number of bottom and + * top blobs. + * + * This method should be overridden to return true if your layer expects an + * equal number of bottom and top blobs. + */ + virtual inline bool EqualNumBottomTopBlobs() const { + return false; + } - /** - * Called by the parent Layer's SetUp to check that the number of bottom - * and top Blobs provided as input match the expected numbers specified by - * the {ExactNum,Min,Max}{Bottom,Top}Blobs() functions. - */ - virtual void CheckBlobCounts(const vector*>& bottom, - const vector*>& top) { - if (ExactNumBottomBlobs() >= 0) { - CHECK_EQ(ExactNumBottomBlobs(), bottom.size()) - << type() << " Layer takes " << ExactNumBottomBlobs() - << " bottom blob(s) as input."; + /** + * @brief Return whether "anonymous" top blobs are created automatically + * by the layer. + * + * If this method returns true, Net::Init will create enough "anonymous" top + * blobs to fulfill the requirement specified by ExactNumTopBlobs() or + * MinTopBlobs(). + */ + virtual inline bool AutoTopBlobs() const { + return false; } - if (MinBottomBlobs() >= 0) { - CHECK_LE(MinBottomBlobs(), bottom.size()) - << type() << " Layer takes at least " << MinBottomBlobs() - << " bottom blob(s) as input."; + + /** + * @brief Return whether to allow force_backward for a given bottom blob + * index. + * + * If AllowForceBackward(i) == false, we will ignore the force_backward + * setting and backpropagate to blob i only if it needs gradient information + * (as is done when force_backward == false). + */ + virtual inline bool AllowForceBackward(const int bottom_index) const { + return true; } - if (MaxBottomBlobs() >= 0) { - CHECK_GE(MaxBottomBlobs(), bottom.size()) - << type() << " Layer takes at most " << MaxBottomBlobs() - << " bottom blob(s) as input."; + + /** + * @brief Specifies whether the layer should compute gradients w.r.t. a + * parameter at a particular index given by param_id. + * + * You can safely ignore false values and always compute gradients + * for all parameters, but possibly with wasteful computation. + */ + inline bool param_propagate_down(const int param_id) { + return + (param_propagate_down_.size() > param_id) ? + param_propagate_down_[param_id] : false; } - if (ExactNumTopBlobs() >= 0) { - CHECK_EQ(ExactNumTopBlobs(), top.size()) - << type() << " Layer produces " << ExactNumTopBlobs() - << " top blob(s) as output."; + /** + * @brief Sets whether the layer should compute gradients w.r.t. a + * parameter at a particular index given by param_id. 
+ */ + inline void set_param_propagate_down(const int param_id, const bool value) { + if (param_propagate_down_.size() <= param_id) { + param_propagate_down_.resize(param_id + 1, true); + } + param_propagate_down_[param_id] = value; } - if (MinTopBlobs() >= 0) { - CHECK_LE(MinTopBlobs(), top.size()) - << type() << " Layer produces at least " << MinTopBlobs() - << " top blob(s) as output."; + + protected: + /** The protobuf that stores the layer parameters */ + LayerParameter layer_param_; + /** The phase: TRAIN or TEST */ + Phase phase_; + /** The vector that stores the learnable parameters as a set of blobs. */ + vector > > blobs_; + /** Vector indicating whether to compute the diff of each param blob. */ + vector param_propagate_down_; + + /** The vector that indicates whether each top blob has a non-zero weight in + * the objective function. */ + vector loss_; + + /** @brief Using the CPU device, compute the layer output. */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top) = 0; + /** + * @brief Using the GPU device, compute the layer output. + * Fall back to Forward_cpu() if unavailable. + */ + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top) { + // LOG(WARNING) << "Using CPU code as backup."; + return Forward_cpu(bottom, top); } - if (MaxTopBlobs() >= 0) { - CHECK_GE(MaxTopBlobs(), top.size()) - << type() << " Layer produces at most " << MaxTopBlobs() - << " top blob(s) as output."; + + /** + * @brief Using the CPU device, compute the gradients for any parameters and + * for the bottom blobs if propagate_down is true. + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) = 0; + /** + * @brief Using the GPU device, compute the gradients for any parameters and + * for the bottom blobs if propagate_down is true. + * Fall back to Backward_cpu() if unavailable. + */ + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + // LOG(WARNING) << "Using CPU code as backup."; + Backward_cpu(top, propagate_down, bottom); } - if (EqualNumBottomTopBlobs()) { - CHECK_EQ(bottom.size(), top.size()) - << type() << " Layer produces one top blob as output for each " - << "bottom blob input."; + + /** + * Called by the parent Layer's SetUp to check that the number of bottom + * and top Blobs provided as input match the expected numbers specified by + * the {ExactNum,Min,Max}{Bottom,Top}Blobs() functions. 
+ */ + virtual void CheckBlobCounts(const vector*>& bottom, + const vector*>& top) { + if (ExactNumBottomBlobs() >= 0) { + CHECK_EQ(ExactNumBottomBlobs(), bottom.size()) << type() + << " Layer takes " << ExactNumBottomBlobs() + << " bottom blob(s) as input."; + } + if (MinBottomBlobs() >= 0) { + CHECK_LE(MinBottomBlobs(), bottom.size()) << type() + << " Layer takes at least " << MinBottomBlobs() + << " bottom blob(s) as input."; + } + if (MaxBottomBlobs() >= 0) { + CHECK_GE(MaxBottomBlobs(), bottom.size()) << type() + << " Layer takes at most " << MaxBottomBlobs() + << " bottom blob(s) as input."; + } + if (ExactNumTopBlobs() >= 0) { + CHECK_EQ(ExactNumTopBlobs(), top.size()) << type() << " Layer produces " + << ExactNumTopBlobs() << " top blob(s) as output."; + } + if (MinTopBlobs() >= 0) { + CHECK_LE(MinTopBlobs(), top.size()) << type() + << " Layer produces at least " << MinTopBlobs() + << " top blob(s) as output."; + } + if (MaxTopBlobs() >= 0) { + CHECK_GE(MaxTopBlobs(), top.size()) << type() + << " Layer produces at most " << MaxTopBlobs() + << " top blob(s) as output."; + } + if (EqualNumBottomTopBlobs()) { + CHECK_EQ(bottom.size(), top.size()) << type() + << " Layer produces one top blob as output for each " + << "bottom blob input."; + } } - } - /** - * Called by SetUp to initialize the weights associated with any top blobs in - * the loss function. Store non-zero loss weights in the diff blob. - */ - inline void SetLossWeights(const vector*>& top) { - const int num_loss_weights = layer_param_.loss_weight_size(); - if (num_loss_weights) { - CHECK_EQ(top.size(), num_loss_weights) << "loss_weight must be " - "unspecified or specified once per top blob."; - for (int top_id = 0; top_id < top.size(); ++top_id) { - const Dtype loss_weight = layer_param_.loss_weight(top_id); - if (loss_weight == Dtype(0)) { continue; } - this->set_loss(top_id, loss_weight); - const int count = top[top_id]->count(); - Dtype* loss_multiplier = top[top_id]->mutable_cpu_diff(); - caffe_set(count, loss_weight, loss_multiplier); + /** + * Called by SetUp to initialize the weights associated with any top blobs in + * the loss function. Store non-zero loss weights in the diff blob. + */ + inline void SetLossWeights(const vector*>& top) { + const int num_loss_weights = layer_param_.loss_weight_size(); + if (num_loss_weights) { + CHECK_EQ(top.size(), num_loss_weights) << "loss_weight must be " + "unspecified or specified once per top blob."; + for (int top_id = 0; top_id < top.size(); ++top_id) { + const Dtype loss_weight = layer_param_.loss_weight(top_id); + if (loss_weight == Dtype(0)) { + continue; + } + this->set_loss(top_id, loss_weight); + const int count = top[top_id]->count(); + Dtype* loss_multiplier = top[top_id]->mutable_cpu_diff(); + caffe_set(count, loss_weight, loss_multiplier); + } } } - } - DISABLE_COPY_AND_ASSIGN(Layer); -}; // class Layer + DISABLE_COPY_AND_ASSIGN (Layer); +}; +// class Layer // Forward and backward wrappers. 
You should implement the cpu and // gpu specific implementations instead, and should not change these @@ -411,7 +434,9 @@ inline Dtype Layer::Forward(const vector*>& bottom, case Caffe::CPU: Forward_cpu(bottom, top); for (int top_id = 0; top_id < top.size(); ++top_id) { - if (!this->loss(top_id)) { continue; } + if (!this->loss(top_id)) { + continue; + } const int count = top[top_id]->count(); const Dtype* data = top[top_id]->cpu_data(); const Dtype* loss_weights = top[top_id]->cpu_diff(); @@ -422,7 +447,9 @@ inline Dtype Layer::Forward(const vector*>& bottom, Forward_gpu(bottom, top); #ifndef CPU_ONLY for (int top_id = 0; top_id < top.size(); ++top_id) { - if (!this->loss(top_id)) { continue; } + if (!this->loss(top_id)) { + continue; + } const int count = top[top_id]->count(); const Dtype* data = top[top_id]->gpu_data(); const Dtype* loss_weights = top[top_id]->gpu_diff(); @@ -440,8 +467,7 @@ inline Dtype Layer::Forward(const vector*>& bottom, template inline void Layer::Backward(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { switch (Caffe::mode()) { case Caffe::CPU: Backward_cpu(top, propagate_down, bottom); diff --git a/include/caffe/layer_factory.hpp b/include/caffe/layer_factory.hpp index 2fcd9386..6da8d315 100644 --- a/include/caffe/layer_factory.hpp +++ b/include/caffe/layer_factory.hpp @@ -52,64 +52,63 @@ class Layer; template class LayerRegistry { - public: - typedef shared_ptr > (*Creator)(const LayerParameter&); - typedef std::map CreatorRegistry; - - static CreatorRegistry& Registry() { - static CreatorRegistry* g_registry_ = new CreatorRegistry(); - return *g_registry_; - } - - // Adds a creator. - static void AddCreator(const string& type, Creator creator) { - CreatorRegistry& registry = Registry(); - CHECK_EQ(registry.count(type), 0) - << "Layer type " << type << " already registered."; - registry[type] = creator; - } - - // Get a layer using a LayerParameter. - static shared_ptr > CreateLayer(const LayerParameter& param) { - LOG(INFO) << "Creating layer " << param.name(); - const string& type = param.type(); - CreatorRegistry& registry = Registry(); - CHECK_EQ(registry.count(type), 1) << "Unknown layer type: " << type - << " (known types: " << LayerTypeList() << ")"; - return registry[type](param); - } - - private: - // Layer registry should never be instantiated - everything is done with its - // static variables. - LayerRegistry() {} - - static string LayerTypeList() { - CreatorRegistry& registry = Registry(); - string layer_types; - for (typename CreatorRegistry::iterator iter = registry.begin(); - iter != registry.end(); ++iter) { - if (iter != registry.begin()) { - layer_types += ", "; + public: + typedef shared_ptr > (*Creator)(const LayerParameter&); + typedef std::map CreatorRegistry; + + static CreatorRegistry& Registry() { + static CreatorRegistry* g_registry_ = new CreatorRegistry(); + return *g_registry_; + } + + // Adds a creator. + static void AddCreator(const string& type, Creator creator) { + CreatorRegistry& registry = Registry(); + CHECK_EQ(registry.count(type), 0) << "Layer type " << type + << " already registered."; + registry[type] = creator; + } + + // Get a layer using a LayerParameter. 
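A sketch, not part of the patch, of how a layer is hooked into the LayerRegistry shown in layer_factory.hpp here. REGISTER_LAYER_CREATOR is the macro defined at the end of this header; the creator function and the IdentitySketchLayer it constructs (the hypothetical layer from the earlier sketch) are made up for illustration.

// Sketch only: registering a hypothetical layer type with the registry.
#include "caffe/layer.hpp"
#include "caffe/layer_factory.hpp"

namespace caffe {

template <typename Dtype>
shared_ptr<Layer<Dtype> > GetIdentitySketchLayer(const LayerParameter& param) {
  return shared_ptr<Layer<Dtype> >(new IdentitySketchLayer<Dtype>(param));
}
REGISTER_LAYER_CREATOR(IdentitySketch, GetIdentitySketchLayer);

}  // namespace caffe

// Afterwards Net::Init can instantiate it from a prototxt entry such as
//   layer { name: "id" type: "IdentitySketch" bottom: "data" top: "data_copy" }
// via LayerRegistry<Dtype>::CreateLayer(param).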
+ static shared_ptr > CreateLayer(const LayerParameter& param) { + LOG(INFO) << "Creating layer " << param.name(); + const string& type = param.type(); + CreatorRegistry& registry = Registry(); + CHECK_EQ(registry.count(type), 1) << "Unknown layer type: " << type + << " (known types: " << LayerTypeList() << ")"; + return registry[type](param); + } + + private: + // Layer registry should never be instantiated - everything is done with its + // static variables. + LayerRegistry() { + } + + static string LayerTypeList() { + CreatorRegistry& registry = Registry(); + string layer_types; + for (typename CreatorRegistry::iterator iter = registry.begin(); + iter != registry.end(); ++iter) { + if (iter != registry.begin()) { + layer_types += ", "; + } + layer_types += iter->first; } - layer_types += iter->first; + return layer_types; } - return layer_types; - } }; - template class LayerRegisterer { - public: - LayerRegisterer(const string& type, - shared_ptr > (*creator)(const LayerParameter&)) { - // LOG(INFO) << "Registering layer type: " << type; - LayerRegistry::AddCreator(type, creator); - } + public: + LayerRegisterer(const string& type, + shared_ptr > (*creator)(const LayerParameter&)) { + // LOG(INFO) << "Registering layer type: " << type; + LayerRegistry::AddCreator(type, creator); + } }; - #define REGISTER_LAYER_CREATOR(type, creator) \ static LayerRegisterer g_creator_f_##type(#type, creator); \ static LayerRegisterer g_creator_d_##type(#type, creator) \ diff --git a/include/caffe/loss_layers.hpp b/include/caffe/loss_layers.hpp index 86c34241..431bd8ea 100644 --- a/include/caffe/loss_layers.hpp +++ b/include/caffe/loss_layers.hpp @@ -20,72 +20,81 @@ const float kLOG_THRESHOLD = 1e-20; * classification task. */ template -class AccuracyLayer : public Layer { - public: - /** - * @param param provides AccuracyParameter accuracy_param, - * with AccuracyLayer options: - * - top_k (\b optional, default 1). - * Sets the maximum rank @f$ k @f$ at which a prediction is considered - * correct. For example, if @f$ k = 5 @f$, a prediction is counted - * correct if the correct label is among the top 5 predicted labels. - */ - explicit AccuracyLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Accuracy"; } - virtual inline int ExactNumBottomBlobs() const { return 2; } - virtual inline int ExactNumTopBlobs() const { return 1; } - - protected: - /** - * @param bottom input Blob vector (length 2) - * -# @f$ (N \times C \times H \times W) @f$ - * the predictions @f$ x @f$, a Blob with values in - * @f$ [-\infty, +\infty] @f$ indicating the predicted score for each of - * the @f$ K = CHW @f$ classes. Each @f$ x_n @f$ is mapped to a predicted - * label @f$ \hat{l}_n @f$ given by its maximal index: - * @f$ \hat{l}_n = \arg\max\limits_k x_{nk} @f$ - * -# @f$ (N \times 1 \times 1 \times 1) @f$ - * the labels @f$ l @f$, an integer-valued Blob with values - * @f$ l_n \in [0, 1, 2, ..., K - 1] @f$ - * indicating the correct class label among the @f$ K @f$ classes - * @param top output Blob vector (length 1) - * -# @f$ (1 \times 1 \times 1 \times 1) @f$ - * the computed accuracy: @f$ - * \frac{1}{N} \sum\limits_{n=1}^N \delta\{ \hat{l}_n = l_n \} - * @f$, where @f$ - * \delta\{\mathrm{condition}\} = \left\{ - * \begin{array}{lr} - * 1 & \mbox{if condition} \\ - * 0 & \mbox{otherwise} - * \end{array} \right. 
- * @f$ - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); +class AccuracyLayer: public Layer { + public: + /** + * @param param provides AccuracyParameter accuracy_param, + * with AccuracyLayer options: + * - top_k (\b optional, default 1). + * Sets the maximum rank @f$ k @f$ at which a prediction is considered + * correct. For example, if @f$ k = 5 @f$, a prediction is counted + * correct if the correct label is among the top 5 predicted labels. + */ + explicit AccuracyLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + virtual inline const char* type() const { + return "Accuracy"; + } + virtual inline int ExactNumBottomBlobs() const { + return 2; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } - /// @brief Not implemented -- AccuracyLayer cannot be used as a loss. - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - for (int i = 0; i < propagate_down.size(); ++i) { - if (propagate_down[i]) { NOT_IMPLEMENTED; } + protected: + /** + * @param bottom input Blob vector (length 2) + * -# @f$ (N \times C \times H \times W) @f$ + * the predictions @f$ x @f$, a Blob with values in + * @f$ [-\infty, +\infty] @f$ indicating the predicted score for each of + * the @f$ K = CHW @f$ classes. Each @f$ x_n @f$ is mapped to a predicted + * label @f$ \hat{l}_n @f$ given by its maximal index: + * @f$ \hat{l}_n = \arg\max\limits_k x_{nk} @f$ + * -# @f$ (N \times 1 \times 1 \times 1) @f$ + * the labels @f$ l @f$, an integer-valued Blob with values + * @f$ l_n \in [0, 1, 2, ..., K - 1] @f$ + * indicating the correct class label among the @f$ K @f$ classes + * @param top output Blob vector (length 1) + * -# @f$ (1 \times 1 \times 1 \times 1) @f$ + * the computed accuracy: @f$ + * \frac{1}{N} \sum\limits_{n=1}^N \delta\{ \hat{l}_n = l_n \} + * @f$, where @f$ + * \delta\{\mathrm{condition}\} = \left\{ + * \begin{array}{lr} + * 1 & \mbox{if condition} \\ + * 0 & \mbox{otherwise} + * \end{array} \right. + * @f$ + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + + /// @brief Not implemented -- AccuracyLayer cannot be used as a loss. + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + for (int i = 0; i < propagate_down.size(); ++i) { + if (propagate_down[i]) { + NOT_IMPLEMENTED; + } + } } - } - int label_axis_, outer_num_, inner_num_; + int label_axis_, outer_num_, inner_num_; - int top_k_; + int top_k_; - /// Whether to ignore instances with a certain label. - bool has_ignore_label_; - /// The label indicating that an instance should be ignored. - int ignore_label_; + /// Whether to ignore instances with a certain label. + bool has_ignore_label_; + /// The label indicating that an instance should be ignored. + int ignore_label_; }; /** @@ -97,32 +106,39 @@ class AccuracyLayer : public Layer { * -- the predictions. 
*/ template -class LossLayer : public Layer { - public: - explicit LossLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp( - const vector*>& bottom, const vector*>& top); - virtual void Reshape( - const vector*>& bottom, const vector*>& top); - - virtual inline int ExactNumBottomBlobs() const { return 2; } - - /** - * @brief For convenience and backwards compatibility, instruct the Net to - * automatically allocate a single top Blob for LossLayers, into which - * they output their singleton loss, (even if the user didn't specify - * one in the prototxt, etc.). - */ - virtual inline bool AutoTopBlobs() const { return true; } - virtual inline int ExactNumTopBlobs() const { return 1; } - /** - * We usually cannot backpropagate to the labels; ignore force_backward for - * these inputs. - */ - virtual inline bool AllowForceBackward(const int bottom_index) const { - return bottom_index != 1; - } +class LossLayer: public Layer { + public: + explicit LossLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline int ExactNumBottomBlobs() const { + return 2; + } + + /** + * @brief For convenience and backwards compatibility, instruct the Net to + * automatically allocate a single top Blob for LossLayers, into which + * they output their singleton loss, (even if the user didn't specify + * one in the prototxt, etc.). + */ + virtual inline bool AutoTopBlobs() const { + return true; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + /** + * We usually cannot backpropagate to the labels; ignore force_backward for + * these inputs. + */ + virtual inline bool AllowForceBackward(const int bottom_index) const { + return bottom_index != 1; + } }; /** @@ -150,64 +166,69 @@ class LossLayer : public Layer { * This can be used to train siamese networks. */ template -class ContrastiveLossLayer : public LossLayer { - public: - explicit ContrastiveLossLayer(const LayerParameter& param) - : LossLayer(param), diff_() {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual inline int ExactNumBottomBlobs() const { return 3; } - virtual inline const char* type() const { return "ContrastiveLoss"; } - /** - * Unlike most loss layers, in the ContrastiveLossLayer we can backpropagate - * to the first two inputs. - */ - virtual inline bool AllowForceBackward(const int bottom_index) const { - return bottom_index != 2; - } - - protected: - /// @copydoc ContrastiveLossLayer - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the Contrastive error gradient w.r.t. the inputs. - * - * Computes the gradients with respect to the two input vectors (bottom[0] and - * bottom[1]), but not the similarity label (bottom[2]). - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (1 \times 1 \times 1 \times 1) @f$ - * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, - * as @f$ \lambda @f$ is the coefficient of this layer's output - * @f$\ell_i@f$ in the overall Net loss - * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence - * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. 
- * (*Assuming that this top Blob is not used as a bottom (input) by any - * other layer of the Net.) - * @param propagate_down see Layer::Backward. - * @param bottom input Blob vector (length 2) - * -# @f$ (N \times C \times 1 \times 1) @f$ - * the features @f$a@f$; Backward fills their diff with - * gradients if propagate_down[0] - * -# @f$ (N \times C \times 1 \times 1) @f$ - * the features @f$b@f$; Backward fills their diff with gradients if - * propagate_down[1] - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - Blob diff_; // cached for backward pass - Blob dist_sq_; // cached for backward pass - Blob diff_sq_; // tmp storage for gpu forward pass - Blob summer_vec_; // tmp storage for gpu forward pass +class ContrastiveLossLayer: public LossLayer { + public: + explicit ContrastiveLossLayer(const LayerParameter& param) + : LossLayer(param), diff_() { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual inline int ExactNumBottomBlobs() const { + return 3; + } + virtual inline const char* type() const { + return "ContrastiveLoss"; + } + /** + * Unlike most loss layers, in the ContrastiveLossLayer we can backpropagate + * to the first two inputs. + */ + virtual inline bool AllowForceBackward(const int bottom_index) const { + return bottom_index != 2; + } + + protected: + /// @copydoc ContrastiveLossLayer + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the Contrastive error gradient w.r.t. the inputs. + * + * Computes the gradients with respect to the two input vectors (bottom[0] and + * bottom[1]), but not the similarity label (bottom[2]). + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (1 \times 1 \times 1 \times 1) @f$ + * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, + * as @f$ \lambda @f$ is the coefficient of this layer's output + * @f$\ell_i@f$ in the overall Net loss + * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence + * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. + * (*Assuming that this top Blob is not used as a bottom (input) by any + * other layer of the Net.) + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length 2) + * -# @f$ (N \times C \times 1 \times 1) @f$ + * the features @f$a@f$; Backward fills their diff with + * gradients if propagate_down[0] + * -# @f$ (N \times C \times 1 \times 1) @f$ + * the features @f$b@f$; Backward fills their diff with gradients if + * propagate_down[1] + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + Blob diff_; // cached for backward pass + Blob dist_sq_; // cached for backward pass + Blob diff_sq_; // tmp storage for gpu forward pass + Blob summer_vec_; // tmp storage for gpu forward pass }; /** @@ -237,68 +258,71 @@ class ContrastiveLossLayer : public LossLayer { * linear least squares problems! We use it only as an instructive example.) 
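For reference, the gradient expressions quoted in the EuclideanLossLayer comments that follow drop out directly from the loss this layer computes. A short derivation, written in plain LaTeX rather than the @f$ ... @f$ Doxygen markup of the header, and assuming the usual 1/(2N) normalization used by this layer:

E = \frac{1}{2N} \sum_{n=1}^{N} \left\| \hat{y}_n - y_n \right\|_2^2
\quad\Longrightarrow\quad
\frac{\partial E}{\partial \hat{y}_n} = \frac{1}{N}\,(\hat{y}_n - y_n),
\qquad
\frac{\partial E}{\partial y_n} = \frac{1}{N}\,(y_n - \hat{y}_n),

which matches the per-bottom gradients documented for Backward below.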
*/ template -class EuclideanLossLayer : public LossLayer { - public: - explicit EuclideanLossLayer(const LayerParameter& param) - : LossLayer(param), diff_() {} - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "EuclideanLoss"; } - /** - * Unlike most loss layers, in the EuclideanLossLayer we can backpropagate - * to both inputs -- override to return true and always allow force_backward. - */ - virtual inline bool AllowForceBackward(const int bottom_index) const { - return true; - } - - protected: - /// @copydoc EuclideanLossLayer - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the Euclidean error gradient w.r.t. the inputs. - * - * Unlike other children of LossLayer, EuclideanLossLayer \b can compute - * gradients with respect to the label inputs bottom[1] (but still only will - * if propagate_down[1] is set, due to being produced by learnable parameters - * or if force_backward is set). In fact, this layer is "commutative" -- the - * result is the same regardless of the order of the two bottoms. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (1 \times 1 \times 1 \times 1) @f$ - * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, - * as @f$ \lambda @f$ is the coefficient of this layer's output - * @f$\ell_i@f$ in the overall Net loss - * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence - * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. - * (*Assuming that this top Blob is not used as a bottom (input) by any - * other layer of the Net.) - * @param propagate_down see Layer::Backward. - * @param bottom input Blob vector (length 2) - * -# @f$ (N \times C \times H \times W) @f$ - * the predictions @f$\hat{y}@f$; Backward fills their diff with - * gradients @f$ - * \frac{\partial E}{\partial \hat{y}} = - * \frac{1}{n} \sum\limits_{n=1}^N (\hat{y}_n - y_n) - * @f$ if propagate_down[0] - * -# @f$ (N \times C \times H \times W) @f$ - * the targets @f$y@f$; Backward fills their diff with gradients - * @f$ \frac{\partial E}{\partial y} = - * \frac{1}{n} \sum\limits_{n=1}^N (y_n - \hat{y}_n) - * @f$ if propagate_down[1] - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - Blob diff_; +class EuclideanLossLayer: public LossLayer { + public: + explicit EuclideanLossLayer(const LayerParameter& param) + : LossLayer(param), diff_() { + } + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "EuclideanLoss"; + } + /** + * Unlike most loss layers, in the EuclideanLossLayer we can backpropagate + * to both inputs -- override to return true and always allow force_backward. + */ + virtual inline bool AllowForceBackward(const int bottom_index) const { + return true; + } + + protected: + /// @copydoc EuclideanLossLayer + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the Euclidean error gradient w.r.t. the inputs. 
+ * + * Unlike other children of LossLayer, EuclideanLossLayer \b can compute + * gradients with respect to the label inputs bottom[1] (but still only will + * if propagate_down[1] is set, due to being produced by learnable parameters + * or if force_backward is set). In fact, this layer is "commutative" -- the + * result is the same regardless of the order of the two bottoms. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (1 \times 1 \times 1 \times 1) @f$ + * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, + * as @f$ \lambda @f$ is the coefficient of this layer's output + * @f$\ell_i@f$ in the overall Net loss + * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence + * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. + * (*Assuming that this top Blob is not used as a bottom (input) by any + * other layer of the Net.) + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length 2) + * -# @f$ (N \times C \times H \times W) @f$ + * the predictions @f$\hat{y}@f$; Backward fills their diff with + * gradients @f$ + * \frac{\partial E}{\partial \hat{y}} = + * \frac{1}{n} \sum\limits_{n=1}^N (\hat{y}_n - y_n) + * @f$ if propagate_down[0] + * -# @f$ (N \times C \times H \times W) @f$ + * the targets @f$y@f$; Backward fills their diff with gradients + * @f$ \frac{\partial E}{\partial y} = + * \frac{1}{n} \sum\limits_{n=1}^N (y_n - \hat{y}_n) + * @f$ if propagate_down[1] + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + Blob diff_; }; /** @@ -345,47 +369,50 @@ class EuclideanLossLayer : public LossLayer { * HingeLossLayer). */ template -class HingeLossLayer : public LossLayer { - public: - explicit HingeLossLayer(const LayerParameter& param) - : LossLayer(param) {} - - virtual inline const char* type() const { return "HingeLoss"; } - - protected: - /// @copydoc HingeLossLayer - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the hinge loss error gradient w.r.t. the predictions. - * - * Gradients cannot be computed with respect to the label inputs (bottom[1]), - * so this method ignores bottom[1] and requires !propagate_down[1], crashing - * if propagate_down[1] is set. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (1 \times 1 \times 1 \times 1) @f$ - * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, - * as @f$ \lambda @f$ is the coefficient of this layer's output - * @f$\ell_i@f$ in the overall Net loss - * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence - * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. - * (*Assuming that this top Blob is not used as a bottom (input) by any - * other layer of the Net.) - * @param propagate_down see Layer::Backward. - * propagate_down[1] must be false as we can't compute gradients with - * respect to the labels. 
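// A short standalone sketch of the EuclideanLossLayer math documented above:
// E = 1/(2N) * sum_n ||y_hat_n - y_n||^2, with the predictions' diff filled
// with loss_weight/N * (y_hat - y). Function and parameter names are
// hypothetical; the real layer operates on Blob buffers, not std::vector.
#include <vector>

double euclidean_loss(const std::vector<double>& y_hat,
                      const std::vector<double>& y,
                      int num, double loss_weight,
                      std::vector<double>* pred_diff) {
  pred_diff->resize(y_hat.size());
  double sum_sq = 0.0;
  for (size_t i = 0; i < y_hat.size(); ++i) {
    const double d = y_hat[i] - y[i];
    (*pred_diff)[i] = loss_weight * d / num;   // dE/dy_hat, as in the doc above
    sum_sq += d * d;
  }
  return sum_sq / (2.0 * num);                 // the forward loss value
}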
- * @param bottom input Blob vector (length 2) - * -# @f$ (N \times C \times H \times W) @f$ - * the predictions @f$t@f$; Backward computes diff - * @f$ \frac{\partial E}{\partial t} @f$ - * -# @f$ (N \times 1 \times 1 \times 1) @f$ - * the labels -- ignored as we can't compute their error gradients - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); +class HingeLossLayer: public LossLayer { + public: + explicit HingeLossLayer(const LayerParameter& param) + : LossLayer(param) { + } + + virtual inline const char* type() const { + return "HingeLoss"; + } + + protected: + /// @copydoc HingeLossLayer + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the hinge loss error gradient w.r.t. the predictions. + * + * Gradients cannot be computed with respect to the label inputs (bottom[1]), + * so this method ignores bottom[1] and requires !propagate_down[1], crashing + * if propagate_down[1] is set. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (1 \times 1 \times 1 \times 1) @f$ + * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, + * as @f$ \lambda @f$ is the coefficient of this layer's output + * @f$\ell_i@f$ in the overall Net loss + * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence + * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. + * (*Assuming that this top Blob is not used as a bottom (input) by any + * other layer of the Net.) + * @param propagate_down see Layer::Backward. + * propagate_down[1] must be false as we can't compute gradients with + * respect to the labels. + * @param bottom input Blob vector (length 2) + * -# @f$ (N \times C \times H \times W) @f$ + * the predictions @f$t@f$; Backward computes diff + * @f$ \frac{\partial E}{\partial t} @f$ + * -# @f$ (N \times 1 \times 1 \times 1) @f$ + * the labels -- ignored as we can't compute their error gradients + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); }; /** @@ -421,65 +448,74 @@ class HingeLossLayer : public LossLayer { * @f$, where @f$ H_{l_n} @f$ denotes row @f$l_n@f$ of @f$H@f$. */ template -class InfogainLossLayer : public LossLayer { - public: - explicit InfogainLossLayer(const LayerParameter& param) - : LossLayer(param), infogain_() {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - // InfogainLossLayer takes 2-3 bottom Blobs; if there are 3 the third should - // be the infogain matrix. (Otherwise the infogain matrix is loaded from a - // file specified by LayerParameter.) - virtual inline int ExactNumBottomBlobs() const { return -1; } - virtual inline int MinBottomBlobs() const { return 2; } - virtual inline int MaxBottomBlobs() const { return 3; } - - virtual inline const char* type() const { return "InfogainLoss"; } - - protected: - /// @copydoc InfogainLossLayer - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the infogain loss error gradient w.r.t. the predictions. - * - * Gradients cannot be computed with respect to the label inputs (bottom[1]), - * so this method ignores bottom[1] and requires !propagate_down[1], crashing - * if propagate_down[1] is set. (The same applies to the infogain matrix, if - * provided as bottom[2] rather than in the layer_param.) 
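// A hedged sketch of the L1 hinge loss that HingeLossLayer (closed above)
// evaluates in Forward_cpu. The exact formulation used here -- true-class
// score entering with a flipped sign, per-element max(0, 1 + s), division by
// N -- is assumed from the standard Caffe definition; hinge_loss_l1 is a
// hypothetical helper, and the L2 variant would square each term.
#include <algorithm>
#include <vector>

// scores: N x K raw predictions t; labels: N ground-truth class indices.
double hinge_loss_l1(const std::vector<double>& scores,
                     const std::vector<int>& labels, int N, int K) {
  double loss = 0.0;
  for (int n = 0; n < N; ++n) {
    for (int k = 0; k < K; ++k) {
      const double s = (k == labels[n]) ? -scores[n * K + k]
                                        :  scores[n * K + k];
      loss += std::max(0.0, 1.0 + s);          // margin violation for class k
    }
  }
  return loss / N;
}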
- * - * @param top output Blob vector (length 1), providing the error gradient - * with respect to the outputs - * -# @f$ (1 \times 1 \times 1 \times 1) @f$ - * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, - * as @f$ \lambda @f$ is the coefficient of this layer's output - * @f$\ell_i@f$ in the overall Net loss - * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence - * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. - * (*Assuming that this top Blob is not used as a bottom (input) by any - * other layer of the Net.) - * @param propagate_down see Layer::Backward. - * propagate_down[1] must be false as we can't compute gradients with - * respect to the labels (similarly for propagate_down[2] and the - * infogain matrix, if provided as bottom[2]) - * @param bottom input Blob vector (length 2-3) - * -# @f$ (N \times C \times H \times W) @f$ - * the predictions @f$ \hat{p} @f$; Backward computes diff - * @f$ \frac{\partial E}{\partial \hat{p}} @f$ - * -# @f$ (N \times 1 \times 1 \times 1) @f$ - * the labels -- ignored as we can't compute their error gradients - * -# @f$ (1 \times 1 \times K \times K) @f$ - * (\b optional) the information gain matrix -- ignored as its error - * gradient computation is not implemented. - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - Blob infogain_; +class InfogainLossLayer: public LossLayer { + public: + explicit InfogainLossLayer(const LayerParameter& param) + : LossLayer(param), infogain_() { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + // InfogainLossLayer takes 2-3 bottom Blobs; if there are 3 the third should + // be the infogain matrix. (Otherwise the infogain matrix is loaded from a + // file specified by LayerParameter.) + virtual inline int ExactNumBottomBlobs() const { + return -1; + } + virtual inline int MinBottomBlobs() const { + return 2; + } + virtual inline int MaxBottomBlobs() const { + return 3; + } + + virtual inline const char* type() const { + return "InfogainLoss"; + } + + protected: + /// @copydoc InfogainLossLayer + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the infogain loss error gradient w.r.t. the predictions. + * + * Gradients cannot be computed with respect to the label inputs (bottom[1]), + * so this method ignores bottom[1] and requires !propagate_down[1], crashing + * if propagate_down[1] is set. (The same applies to the infogain matrix, if + * provided as bottom[2] rather than in the layer_param.) + * + * @param top output Blob vector (length 1), providing the error gradient + * with respect to the outputs + * -# @f$ (1 \times 1 \times 1 \times 1) @f$ + * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, + * as @f$ \lambda @f$ is the coefficient of this layer's output + * @f$\ell_i@f$ in the overall Net loss + * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence + * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. + * (*Assuming that this top Blob is not used as a bottom (input) by any + * other layer of the Net.) + * @param propagate_down see Layer::Backward. 
+ * propagate_down[1] must be false as we can't compute gradients with + * respect to the labels (similarly for propagate_down[2] and the + * infogain matrix, if provided as bottom[2]) + * @param bottom input Blob vector (length 2-3) + * -# @f$ (N \times C \times H \times W) @f$ + * the predictions @f$ \hat{p} @f$; Backward computes diff + * @f$ \frac{\partial E}{\partial \hat{p}} @f$ + * -# @f$ (N \times 1 \times 1 \times 1) @f$ + * the labels -- ignored as we can't compute their error gradients + * -# @f$ (1 \times 1 \times K \times K) @f$ + * (\b optional) the information gain matrix -- ignored as its error + * gradient computation is not implemented. + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + Blob infogain_; }; /** @@ -512,50 +548,53 @@ class InfogainLossLayer : public LossLayer { * @f$ */ template -class MultinomialLogisticLossLayer : public LossLayer { - public: - explicit MultinomialLogisticLossLayer(const LayerParameter& param) - : LossLayer(param) {} - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "MultinomialLogisticLoss"; } - - protected: - /// @copydoc MultinomialLogisticLossLayer - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the multinomial logistic loss error gradient w.r.t. the - * predictions. - * - * Gradients cannot be computed with respect to the label inputs (bottom[1]), - * so this method ignores bottom[1] and requires !propagate_down[1], crashing - * if propagate_down[1] is set. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (1 \times 1 \times 1 \times 1) @f$ - * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, - * as @f$ \lambda @f$ is the coefficient of this layer's output - * @f$\ell_i@f$ in the overall Net loss - * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence - * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. - * (*Assuming that this top Blob is not used as a bottom (input) by any - * other layer of the Net.) - * @param propagate_down see Layer::Backward. - * propagate_down[1] must be false as we can't compute gradients with - * respect to the labels. - * @param bottom input Blob vector (length 2) - * -# @f$ (N \times C \times H \times W) @f$ - * the predictions @f$ \hat{p} @f$; Backward computes diff - * @f$ \frac{\partial E}{\partial \hat{p}} @f$ - * -# @f$ (N \times 1 \times 1 \times 1) @f$ - * the labels -- ignored as we can't compute their error gradients - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); +class MultinomialLogisticLossLayer: public LossLayer { + public: + explicit MultinomialLogisticLossLayer(const LayerParameter& param) + : LossLayer(param) { + } + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "MultinomialLogisticLoss"; + } + + protected: + /// @copydoc MultinomialLogisticLossLayer + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the multinomial logistic loss error gradient w.r.t. the + * predictions. + * + * Gradients cannot be computed with respect to the label inputs (bottom[1]), + * so this method ignores bottom[1] and requires !propagate_down[1], crashing + * if propagate_down[1] is set. 
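// A plain-CPU sketch of the loss that the InfogainLossLayer above computes:
// E = -1/N * sum_n sum_k H[l_n][k] * log(p_hat[n][k]), where H is the K x K
// infogain matrix (an identity H reduces this to the multinomial logistic
// loss). The clamp constant and the helper name are illustrative assumptions,
// not the layer's exact code.
#include <algorithm>
#include <cmath>
#include <vector>

double infogain_loss(const std::vector<double>& prob,   // N x K predicted probabilities
                     const std::vector<int>& label,     // N indices selecting rows of H
                     const std::vector<double>& H,      // K x K infogain matrix
                     int N, int K) {
  double loss = 0.0;
  for (int n = 0; n < N; ++n) {
    const int ln = label[n];
    for (int k = 0; k < K; ++k) {
      const double p = std::max(prob[n * K + k], 1e-20);  // avoid log(0)
      loss -= H[ln * K + k] * std::log(p);
    }
  }
  return loss / N;
}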
+ * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (1 \times 1 \times 1 \times 1) @f$ + * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, + * as @f$ \lambda @f$ is the coefficient of this layer's output + * @f$\ell_i@f$ in the overall Net loss + * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence + * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. + * (*Assuming that this top Blob is not used as a bottom (input) by any + * other layer of the Net.) + * @param propagate_down see Layer::Backward. + * propagate_down[1] must be false as we can't compute gradients with + * respect to the labels. + * @param bottom input Blob vector (length 2) + * -# @f$ (N \times C \times H \times W) @f$ + * the predictions @f$ \hat{p} @f$; Backward computes diff + * @f$ \frac{\partial E}{\partial \hat{p}} @f$ + * -# @f$ (N \times 1 \times 1 \times 1) @f$ + * the labels -- ignored as we can't compute their error gradients + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); }; /** @@ -588,67 +627,69 @@ class MultinomialLogisticLossLayer : public LossLayer { * @f$ */ template -class SigmoidCrossEntropyLossLayer : public LossLayer { - public: - explicit SigmoidCrossEntropyLossLayer(const LayerParameter& param) - : LossLayer(param), - sigmoid_layer_(new SigmoidLayer(param)), - sigmoid_output_(new Blob()) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "SigmoidCrossEntropyLoss"; } - - protected: - /// @copydoc SigmoidCrossEntropyLossLayer - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the sigmoid cross-entropy loss error gradient w.r.t. the - * predictions. - * - * Gradients cannot be computed with respect to the target inputs (bottom[1]), - * so this method ignores bottom[1] and requires !propagate_down[1], crashing - * if propagate_down[1] is set. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (1 \times 1 \times 1 \times 1) @f$ - * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, - * as @f$ \lambda @f$ is the coefficient of this layer's output - * @f$\ell_i@f$ in the overall Net loss - * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence - * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. - * (*Assuming that this top Blob is not used as a bottom (input) by any - * other layer of the Net.) - * @param propagate_down see Layer::Backward. - * propagate_down[1] must be false as gradient computation with respect - * to the targets is not implemented. - * @param bottom input Blob vector (length 2) - * -# @f$ (N \times C \times H \times W) @f$ - * the predictions @f$x@f$; Backward computes diff - * @f$ \frac{\partial E}{\partial x} = - * \frac{1}{n} \sum\limits_{n=1}^N (\hat{p}_n - p_n) - * @f$ - * -# @f$ (N \times 1 \times 1 \times 1) @f$ - * the labels -- ignored as we can't compute their error gradients - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - /// The internal SigmoidLayer used to map predictions to probabilities. 
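// For comparison with the infogain sketch, the MultinomialLogisticLossLayer
// declared above reduces to one term per sample: E = -1/N * sum_n
// log(p_hat[n][l_n]). Again a hypothetical standalone helper, not the layer's
// Forward_cpu.
#include <algorithm>
#include <cmath>
#include <vector>

double multinomial_logistic_loss(const std::vector<double>& prob,  // N x K probabilities
                                 const std::vector<int>& label,    // N class indices
                                 int N, int K) {
  double loss = 0.0;
  for (int n = 0; n < N; ++n) {
    loss -= std::log(std::max(prob[n * K + label[n]], 1e-20));  // clamp avoids log(0)
  }
  return loss / N;
}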
- shared_ptr > sigmoid_layer_; - /// sigmoid_output stores the output of the SigmoidLayer. - shared_ptr > sigmoid_output_; - /// bottom vector holder to call the underlying SigmoidLayer::Forward - vector*> sigmoid_bottom_vec_; - /// top vector holder to call the underlying SigmoidLayer::Forward - vector*> sigmoid_top_vec_; +class SigmoidCrossEntropyLossLayer: public LossLayer { + public: + explicit SigmoidCrossEntropyLossLayer(const LayerParameter& param) + : LossLayer(param), sigmoid_layer_( + new SigmoidLayer(param)), sigmoid_output_(new Blob()) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "SigmoidCrossEntropyLoss"; + } + + protected: + /// @copydoc SigmoidCrossEntropyLossLayer + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the sigmoid cross-entropy loss error gradient w.r.t. the + * predictions. + * + * Gradients cannot be computed with respect to the target inputs (bottom[1]), + * so this method ignores bottom[1] and requires !propagate_down[1], crashing + * if propagate_down[1] is set. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (1 \times 1 \times 1 \times 1) @f$ + * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, + * as @f$ \lambda @f$ is the coefficient of this layer's output + * @f$\ell_i@f$ in the overall Net loss + * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence + * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. + * (*Assuming that this top Blob is not used as a bottom (input) by any + * other layer of the Net.) + * @param propagate_down see Layer::Backward. + * propagate_down[1] must be false as gradient computation with respect + * to the targets is not implemented. + * @param bottom input Blob vector (length 2) + * -# @f$ (N \times C \times H \times W) @f$ + * the predictions @f$x@f$; Backward computes diff + * @f$ \frac{\partial E}{\partial x} = + * \frac{1}{n} \sum\limits_{n=1}^N (\hat{p}_n - p_n) + * @f$ + * -# @f$ (N \times 1 \times 1 \times 1) @f$ + * the labels -- ignored as we can't compute their error gradients + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + /// The internal SigmoidLayer used to map predictions to probabilities. + shared_ptr > sigmoid_layer_; + /// sigmoid_output stores the output of the SigmoidLayer. + shared_ptr > sigmoid_output_; + /// bottom vector holder to call the underlying SigmoidLayer::Forward + vector*> sigmoid_bottom_vec_; + /// top vector holder to call the underlying SigmoidLayer::Forward + vector*> sigmoid_top_vec_; }; // Forward declare SoftmaxLayer for use in SoftmaxWithLossLayer. @@ -668,6 +709,7 @@ template class SoftmaxLayer; * -# @f$ (N \times C \times H \times W) @f$ * the predictions @f$ x @f$, a Blob with values in * @f$ [-\infty, +\infty] @f$ indicating the predicted score for each of + ss * the @f$ K = CHW @f$ classes. 
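// A numerically careful, standalone sketch of the per-element loss that
// SigmoidCrossEntropyLossLayer (declared above) accumulates: for a logit x
// and target p, -[p*log(sigmoid(x)) + (1-p)*log(1-sigmoid(x))] equals
// max(x, 0) - x*p + log(1 + exp(-|x|)), a form that avoids overflow for
// large |x|. The helper name and plain-vector interface are assumptions.
#include <algorithm>
#include <cmath>
#include <vector>

double sigmoid_cross_entropy_loss(const std::vector<double>& x,   // logits
                                  const std::vector<double>& p,   // targets in [0,1]
                                  int num) {                      // batch size N
  double loss = 0.0;
  for (size_t i = 0; i < x.size(); ++i) {
    loss += std::max(x[i], 0.0) - x[i] * p[i]
          + std::log1p(std::exp(-std::fabs(x[i])));
  }
  return loss / num;
}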
This layer maps these scores to a * probability distribution over classes using the softmax function * @f$ \hat{p}_{nk} = \exp(x_{nk}) / @@ -683,84 +725,100 @@ template class SoftmaxLayer; * @f$, for softmax output class probabilites @f$ \hat{p} @f$ */ template -class SoftmaxWithLossLayer : public LossLayer { - public: - /** - * @param param provides LossParameter loss_param, with options: - * - ignore_label (optional) - * Specify a label value that should be ignored when computing the loss. - * - normalize (optional, default true) - * If true, the loss is normalized by the number of (nonignored) labels - * present; otherwise the loss is simply summed over spatial locations. - */ - explicit SoftmaxWithLossLayer(const LayerParameter& param) - : LossLayer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "SoftmaxWithLoss"; } - virtual inline int ExactNumTopBlobs() const { return -1; } - virtual inline int MinTopBlobs() const { return 1; } - virtual inline int MaxTopBlobs() const { return 2; } - - protected: - /// @copydoc SoftmaxWithLossLayer - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - /** - * @brief Computes the softmax loss error gradient w.r.t. the predictions. - * - * Gradients cannot be computed with respect to the label inputs (bottom[1]), - * so this method ignores bottom[1] and requires !propagate_down[1], crashing - * if propagate_down[1] is set. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (1 \times 1 \times 1 \times 1) @f$ - * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, - * as @f$ \lambda @f$ is the coefficient of this layer's output - * @f$\ell_i@f$ in the overall Net loss - * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence - * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. - * (*Assuming that this top Blob is not used as a bottom (input) by any - * other layer of the Net.) - * @param propagate_down see Layer::Backward. - * propagate_down[1] must be false as we can't compute gradients with - * respect to the labels. - * @param bottom input Blob vector (length 2) - * -# @f$ (N \times C \times H \times W) @f$ - * the predictions @f$ x @f$; Backward computes diff - * @f$ \frac{\partial E}{\partial x} @f$ - * -# @f$ (N \times 1 \times 1 \times 1) @f$ - * the labels -- ignored as we can't compute their error gradients - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - - /// The internal SoftmaxLayer used to map predictions to a distribution. - shared_ptr > softmax_layer_; - /// prob stores the output probability predictions from the SoftmaxLayer. - Blob prob_; - /// bottom vector holder used in call to the underlying SoftmaxLayer::Forward - vector*> softmax_bottom_vec_; - /// top vector holder used in call to the underlying SoftmaxLayer::Forward - vector*> softmax_top_vec_; - /// Whether to ignore instances with a certain label. - bool has_ignore_label_; - /// The label indicating that an instance should be ignored. 
- int ignore_label_; - /// Whether to normalize the loss by the total number of values present - /// (otherwise just by the batch size). - bool normalize_; - - int softmax_axis_, outer_num_, inner_num_; +class SoftmaxWithLossLayer: public LossLayer { + public: + /** + * @param param provides LossParameter loss_param, with options: + * - ignore_label (optional) + * Specify a label value that should be ignored when computing the loss. + * - normalize (optional, default true) + * If true, the loss is normalized by the number of (nonignored) labels + * present; otherwise the loss is simply summed over spatial locations. + */ + explicit SoftmaxWithLossLayer(const LayerParameter& param) + : LossLayer(param) { + } + ~SoftmaxWithLossLayer(); + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "SoftmaxWithLoss"; + } + virtual inline int ExactNumTopBlobs() const { + return -1; + } + virtual inline int MinTopBlobs() const { + return 1; + } + virtual inline int MaxTopBlobs() const { + return 2; + } + + protected: + /// @copydoc SoftmaxWithLossLayer + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + /** + * @brief Computes the softmax loss error gradient w.r.t. the predictions. + * + * Gradients cannot be computed with respect to the label inputs (bottom[1]), + * so this method ignores bottom[1] and requires !propagate_down[1], crashing + * if propagate_down[1] is set. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (1 \times 1 \times 1 \times 1) @f$ + * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, + * as @f$ \lambda @f$ is the coefficient of this layer's output + * @f$\ell_i@f$ in the overall Net loss + * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence + * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. + * (*Assuming that this top Blob is not used as a bottom (input) by any + * other layer of the Net.) + * @param propagate_down see Layer::Backward. + * propagate_down[1] must be false as we can't compute gradients with + * respect to the labels. + * @param bottom input Blob vector (length 2) + * -# @f$ (N \times C \times H \times W) @f$ + * the predictions @f$ x @f$; Backward computes diff + * @f$ \frac{\partial E}{\partial x} @f$ + * -# @f$ (N \times 1 \times 1 \times 1) @f$ + * the labels -- ignored as we can't compute their error gradients + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + void ocl_setup(); + + /// The internal SoftmaxLayer used to map predictions to a distribution. + shared_ptr > softmax_layer_; + /// prob stores the output probability predictions from the SoftmaxLayer. + Blob prob_; + /// bottom vector holder used in call to the underlying SoftmaxLayer::Forward + vector*> softmax_bottom_vec_; + /// top vector holder used in call to the underlying SoftmaxLayer::Forward + vector*> softmax_top_vec_; + /// Whether to ignore instances with a certain label. + bool has_ignore_label_; + /// The label indicating that an instance should be ignored. 
+ int ignore_label_; + /// Whether to normalize the loss by the total number of values present + /// (otherwise just by the batch size). + bool normalize_; + + int softmax_axis_, outer_num_, inner_num_; + + protected: + cl_kernel diff_kernel, scal_kernel, softmax_kernel; + cl_mem d_loss; + cl_kernel softmax_loss_fp_kernel; + cl_kernel softmax_loss_bp_kernel; }; } // namespace caffe diff --git a/include/caffe/net.hpp b/include/caffe/net.hpp index 5665df1e..bbd61b88 100644 --- a/include/caffe/net.hpp +++ b/include/caffe/net.hpp @@ -22,247 +22,266 @@ namespace caffe { */ template class Net { - public: - explicit Net(const NetParameter& param); - explicit Net(const string& param_file, Phase phase); - virtual ~Net() {} + public: + explicit Net(const NetParameter& param); + explicit Net(const string& param_file, Phase phase); + virtual ~Net() { + } - /// @brief Initialize a network with a NetParameter. - void Init(const NetParameter& param); + /// @brief Initialize a network with a NetParameter. + void Init(const NetParameter& param); - /** - * @brief Run Forward with the input Blob%s already fed separately. - * - * You can get the input blobs using input_blobs(). - */ - const vector*>& ForwardPrefilled(Dtype* loss = NULL); + /** + * @brief Run Forward with the input Blob%s already fed separately. + * + * You can get the input blobs using input_blobs(). + */ + const vector*>& ForwardPrefilled(Dtype* loss = NULL); - /** - * The From and To variants of Forward and Backward operate on the - * (topological) ordering by which the net is specified. For general DAG - * networks, note that (1) computing from one layer to another might entail - * extra computation on unrelated branches, and (2) computation starting in - * the middle may be incorrect if all of the layers of a fan-in are not - * included. - */ - Dtype ForwardFromTo(int start, int end); - Dtype ForwardFrom(int start); - Dtype ForwardTo(int end); - /// @brief Run forward using a set of bottom blobs, and return the result. - const vector*>& Forward(const vector* > & bottom, - Dtype* loss = NULL); - /** - * @brief Run forward using a serialized BlobProtoVector and return the - * result as a serialized BlobProtoVector - */ - string Forward(const string& input_blob_protos, Dtype* loss = NULL); + /** + * The From and To variants of Forward and Backward operate on the + * (topological) ordering by which the net is specified. For general DAG + * networks, note that (1) computing from one layer to another might entail + * extra computation on unrelated branches, and (2) computation starting in + * the middle may be incorrect if all of the layers of a fan-in are not + * included. + */ + Dtype ForwardFromTo(int start, int end); + Dtype ForwardFrom(int start); + Dtype ForwardTo(int end); + /// @brief Run forward using a set of bottom blobs, and return the result. + const vector*>& Forward(const vector*> & bottom, + Dtype* loss = NULL); + /** + * @brief Run forward using a serialized BlobProtoVector and return the + * result as a serialized BlobProtoVector + */ + string Forward(const string& input_blob_protos, Dtype* loss = NULL); - /** - * The network backward should take no input and output, since it solely - * computes the gradient w.r.t the parameters, and the data has already been - * provided during the forward pass. 
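// Looking back at SoftmaxWithLossLayer above: its Backward pass uses the
// classic softmax-loss gradient dE/dx[n][k] = p_hat[n][k] - 1{k == l_n},
// scaled by loss_weight over the normalizer. A hedged sketch follows;
// ignore_label handling and the non-batch normalization option are omitted,
// and the helper name is hypothetical.
#include <vector>

void softmax_loss_backward(const std::vector<double>& prob,   // N x K softmax output
                           const std::vector<int>& label,     // N ground-truth indices
                           int N, int K, double loss_weight,
                           std::vector<double>* bottom_diff) {
  *bottom_diff = prob;                                   // start from p_hat
  for (int n = 0; n < N; ++n) {
    (*bottom_diff)[n * K + label[n]] -= 1.0;             // subtract the one-hot target
  }
  for (size_t i = 0; i < bottom_diff->size(); ++i) {
    (*bottom_diff)[i] *= loss_weight / N;                // normalize by batch size
  }
}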
- */ - void Backward(); - void BackwardFromTo(int start, int end); - void BackwardFrom(int start); - void BackwardTo(int end); + /** + * The network backward should take no input and output, since it solely + * computes the gradient w.r.t the parameters, and the data has already been + * provided during the forward pass. + */ + void Backward(); + void BackwardFromTo(int start, int end); + void BackwardFrom(int start); + void BackwardTo(int end); - /** - * @brief Reshape all layers from bottom to top. - * - * This is useful to propagate changes to layer sizes without running - * a forward pass, e.g. to compute output feature size. - */ - void Reshape(); + /** + * @brief Reshape all layers from bottom to top. + * + * This is useful to propagate changes to layer sizes without running + * a forward pass, e.g. to compute output feature size. + */ + void Reshape(); - Dtype ForwardBackward(const vector* > & bottom) { - Dtype loss; - Forward(bottom, &loss); - Backward(); - return loss; - } + Dtype ForwardBackward(const vector*> & bottom) { + Dtype loss; + Forward(bottom, &loss); + Backward(); + return loss; + } - /// @brief Updates the network weights based on the diff values computed. - void Update(); + /// @brief Updates the network weights based on the diff values computed. + void Update(); - /** - * @brief For an already initialized net, implicitly copies (i.e., using no - * additional memory) the pre-trained layers from another Net. - */ - void ShareTrainedLayersWith(const Net* other); - // For an already initialized net, CopyTrainedLayersFrom() copies the already - // trained layers from another net parameter instance. - /** - * @brief For an already initialized net, copies the pre-trained layers from - * another Net. - */ - void CopyTrainedLayersFrom(const NetParameter& param); - void CopyTrainedLayersFrom(const string trained_filename); - /// @brief Writes the net to a proto. - void ToProto(NetParameter* param, bool write_diff = false) const; + /** + * @brief For an already initialized net, implicitly copies (i.e., using no + * additional memory) the pre-trained layers from another Net. + */ + void ShareTrainedLayersWith(const Net* other); + // For an already initialized net, CopyTrainedLayersFrom() copies the already + // trained layers from another net parameter instance. + /** + * @brief For an already initialized net, copies the pre-trained layers from + * another Net. + */ + void CopyTrainedLayersFrom(const NetParameter& param); + void CopyTrainedLayersFrom(const string trained_filename); + /// @brief Writes the net to a proto. + void ToProto(NetParameter* param, bool write_diff = false) const; - /// @brief returns the network name. - inline const string& name() const { return name_; } - /// @brief returns the layer names - inline const vector& layer_names() const { return layer_names_; } - /// @brief returns the blob names - inline const vector& blob_names() const { return blob_names_; } - /// @brief returns the blobs - inline const vector > >& blobs() const { - return blobs_; - } - /// @brief returns the layers - inline const vector > >& layers() const { - return layers_; - } - /// @brief returns the phase: TRAIN or TEST - inline Phase phase() const { return phase_; } - /** - * @brief returns the bottom vecs for each layer -- usually you won't - * need this unless you do per-layer checks such as gradients. 
- */ - inline const vector*> >& bottom_vecs() const { - return bottom_vecs_; - } - /** - * @brief returns the top vecs for each layer -- usually you won't - * need this unless you do per-layer checks such as gradients. - */ - inline const vector*> >& top_vecs() const { - return top_vecs_; - } - inline const vector >& bottom_need_backward() const { - return bottom_need_backward_; - } - inline const vector& blob_loss_weights() const { - return blob_loss_weights_; - } - inline const vector& layer_need_backward() const { - return layer_need_backward_; - } - /// @brief returns the parameters - inline const vector > >& params() const { - return params_; - } - /// @brief returns the parameter learning rate multipliers - inline const vector& params_lr() const { return params_lr_; } - inline const vector& params_weight_decay() const { - return params_weight_decay_; - } - const map& param_names_index() const { - return param_names_index_; - } - inline const vector& param_owners() const { return param_owners_; } - /// @brief Input and output blob numbers - inline int num_inputs() const { return net_input_blobs_.size(); } - inline int num_outputs() const { return net_output_blobs_.size(); } - inline const vector*>& input_blobs() const { - return net_input_blobs_; - } - inline const vector*>& output_blobs() const { - return net_output_blobs_; - } - inline const vector& input_blob_indices() const { - return net_input_blob_indices_; - } - inline const vector& output_blob_indices() const { - return net_output_blob_indices_; - } - bool has_blob(const string& blob_name) const; - const shared_ptr > blob_by_name(const string& blob_name) const; - bool has_layer(const string& layer_name) const; - const shared_ptr > layer_by_name(const string& layer_name) const; + /// @brief returns the network name. + inline const string& name() const { + return name_; + } + /// @brief returns the layer names + inline const vector& layer_names() const { + return layer_names_; + } + /// @brief returns the blob names + inline const vector& blob_names() const { + return blob_names_; + } + /// @brief returns the blobs + inline const vector > >& blobs() const { + return blobs_; + } + /// @brief returns the layers + inline const vector > >& layers() const { + return layers_; + } + /// @brief returns the phase: TRAIN or TEST + inline Phase phase() const { + return phase_; + } + /** + * @brief returns the bottom vecs for each layer -- usually you won't + * need this unless you do per-layer checks such as gradients. + */ + inline const vector*> >& bottom_vecs() const { + return bottom_vecs_; + } + /** + * @brief returns the top vecs for each layer -- usually you won't + * need this unless you do per-layer checks such as gradients. 
+ */ + inline const vector*> >& top_vecs() const { + return top_vecs_; + } + inline const vector >& bottom_need_backward() const { + return bottom_need_backward_; + } + inline const vector& blob_loss_weights() const { + return blob_loss_weights_; + } + inline const vector& layer_need_backward() const { + return layer_need_backward_; + } + /// @brief returns the parameters + inline const vector > >& params() const { + return params_; + } + /// @brief returns the parameter learning rate multipliers + inline const vector& params_lr() const { + return params_lr_; + } + inline const vector& params_weight_decay() const { + return params_weight_decay_; + } + const map& param_names_index() const { + return param_names_index_; + } + inline const vector& param_owners() const { + return param_owners_; + } + /// @brief Input and output blob numbers + inline int num_inputs() const { + return net_input_blobs_.size(); + } + inline int num_outputs() const { + return net_output_blobs_.size(); + } + inline const vector*>& input_blobs() const { + return net_input_blobs_; + } + inline const vector*>& output_blobs() const { + return net_output_blobs_; + } + inline const vector& input_blob_indices() const { + return net_input_blob_indices_; + } + inline const vector& output_blob_indices() const { + return net_output_blob_indices_; + } + bool has_blob(const string& blob_name) const; + const shared_ptr > blob_by_name(const string& blob_name) const; + bool has_layer(const string& layer_name) const; + const shared_ptr > layer_by_name( + const string& layer_name) const; - void set_debug_info(const bool value) { debug_info_ = value; } + void set_debug_info(const bool value) { + debug_info_ = value; + } - // Helpers for Init. - /** - * @brief Remove layers that the user specified should be excluded given the current - * phase, level, and stage. - */ - static void FilterNet(const NetParameter& param, - NetParameter* param_filtered); - /// @brief return whether NetState state meets NetStateRule rule - static bool StateMeetsRule(const NetState& state, const NetStateRule& rule, - const string& layer_name); + // Helpers for Init. + /** + * @brief Remove layers that the user specified should be excluded given the current + * phase, level, and stage. + */ + static void FilterNet(const NetParameter& param, + NetParameter* param_filtered); + /// @brief return whether NetState state meets NetStateRule rule + static bool StateMeetsRule(const NetState& state, const NetStateRule& rule, + const string& layer_name); - protected: - // Helpers for Init. - /// @brief Append a new input or top blob to the net. - void AppendTop(const NetParameter& param, const int layer_id, - const int top_id, set* available_blobs, - map* blob_name_to_idx); - /// @brief Append a new bottom blob to the net. - int AppendBottom(const NetParameter& param, const int layer_id, - const int bottom_id, set* available_blobs, - map* blob_name_to_idx); - /// @brief Append a new parameter blob to the net. - void AppendParam(const NetParameter& param, const int layer_id, - const int param_id); + protected: + // Helpers for Init. + /// @brief Append a new input or top blob to the net. + void AppendTop(const NetParameter& param, const int layer_id, + const int top_id, set* available_blobs, + map* blob_name_to_idx); + /// @brief Append a new bottom blob to the net. + int AppendBottom(const NetParameter& param, const int layer_id, + const int bottom_id, set* available_blobs, + map* blob_name_to_idx); + /// @brief Append a new parameter blob to the net. 
+ void AppendParam(const NetParameter& param, const int layer_id, + const int param_id); - /// @brief Helper for displaying debug info in Forward about input Blobs. - void InputDebugInfo(const int layer_id); - /// @brief Helper for displaying debug info in Forward. - void ForwardDebugInfo(const int layer_id); - /// @brief Helper for displaying debug info in Backward. - void BackwardDebugInfo(const int layer_id); - /// @brief Helper for displaying debug info in Update. - void UpdateDebugInfo(const int param_id); + /// @brief Helper for displaying debug info in Forward about input Blobs. + void InputDebugInfo(const int layer_id); + /// @brief Helper for displaying debug info in Forward. + void ForwardDebugInfo(const int layer_id); + /// @brief Helper for displaying debug info in Backward. + void BackwardDebugInfo(const int layer_id); + /// @brief Helper for displaying debug info in Update. + void UpdateDebugInfo(const int param_id); - /// @brief Get misc parameters, e.g. the LR multiplier and weight decay. - void GetLearningRateAndWeightDecay(); + /// @brief Get misc parameters, e.g. the LR multiplier and weight decay. + void GetLearningRateAndWeightDecay(); - /// @brief The network name - string name_; - /// @brief The phase: TRAIN or TEST - Phase phase_; - /// @brief Individual layers in the net - vector > > layers_; - vector layer_names_; - map layer_names_index_; - vector layer_need_backward_; - /// @brief the blobs storing intermediate results between the layer. - vector > > blobs_; - vector blob_names_; - map blob_names_index_; - vector blob_need_backward_; - /// bottom_vecs stores the vectors containing the input for each layer. - /// They don't actually host the blobs (blobs_ does), so we simply store - /// pointers. - vector*> > bottom_vecs_; - vector > bottom_id_vecs_; - vector > bottom_need_backward_; - /// top_vecs stores the vectors containing the output for each layer - vector*> > top_vecs_; - vector > top_id_vecs_; - /// Vector of weight in the loss (or objective) function of each net blob, - /// indexed by blob_id. - vector blob_loss_weights_; - vector > param_id_vecs_; - vector param_owners_; - vector param_display_names_; - vector > param_layer_indices_; - map param_names_index_; - /// blob indices for the input and the output of the net - vector net_input_blob_indices_; - vector net_output_blob_indices_; - vector*> net_input_blobs_; - vector*> net_output_blobs_; - /// The parameters in the network. - vector > > params_; - /// the learning rate multipliers - vector params_lr_; - /// the weight decay multipliers - vector params_weight_decay_; - /// The bytes of memory used by this net - size_t memory_used_; - /// Whether to compute and display debug info for the net. - bool debug_info_; + /// @brief The network name + string name_; + /// @brief The phase: TRAIN or TEST + Phase phase_; + /// @brief Individual layers in the net + vector > > layers_; + vector layer_names_; + map layer_names_index_; + vector layer_need_backward_; + /// @brief the blobs storing intermediate results between the layer. + vector > > blobs_; + vector blob_names_; + map blob_names_index_; + vector blob_need_backward_; + /// bottom_vecs stores the vectors containing the input for each layer. + /// They don't actually host the blobs (blobs_ does), so we simply store + /// pointers. 
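// A schematic usage sketch of the Net interface declared in this header:
// ForwardBackward() runs one forward pass plus gradient computation, and
// Update() applies the diffs already stored on the parameter blobs. In real
// use a Solver drives this loop and rescales the diffs (learning rate,
// momentum, weight decay) first; the phase constant caffe::TRAIN and the
// empty bottom vector (data layers feed the net) are assumptions made only
// for this illustration.
#include <string>
#include <vector>
#include "caffe/net.hpp"

void toy_training_loop(const std::string& prototxt, int iters) {
  caffe::Net<float> net(prototxt, caffe::TRAIN);
  std::vector<caffe::Blob<float>*> bottom;      // empty: data layers feed the net
  for (int i = 0; i < iters; ++i) {
    float loss = net.ForwardBackward(bottom);   // forward + backward
    net.Update();                               // apply the computed diffs
    (void) loss;                                // a Solver would smooth and log this
  }
}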
+ vector*> > bottom_vecs_; + vector > bottom_id_vecs_; + vector > bottom_need_backward_; + /// top_vecs stores the vectors containing the output for each layer + vector*> > top_vecs_; + vector > top_id_vecs_; + /// Vector of weight in the loss (or objective) function of each net blob, + /// indexed by blob_id. + vector blob_loss_weights_; + vector > param_id_vecs_; + vector param_owners_; + vector param_display_names_; + vector > param_layer_indices_; + map param_names_index_; + /// blob indices for the input and the output of the net + vector net_input_blob_indices_; + vector net_output_blob_indices_; + vector*> net_input_blobs_; + vector*> net_output_blobs_; + /// The parameters in the network. + vector > > params_; + /// the learning rate multipliers + vector params_lr_; + /// the weight decay multipliers + vector params_weight_decay_; + /// The bytes of memory used by this net + size_t memory_used_; + /// Whether to compute and display debug info for the net. + bool debug_info_; - DISABLE_COPY_AND_ASSIGN(Net); + DISABLE_COPY_AND_ASSIGN (Net); }; - } // namespace caffe #endif // CAFFE_NET_HPP_ diff --git a/include/caffe/neuron_layers.hpp b/include/caffe/neuron_layers.hpp index c2e0774a..2a240a5f 100644 --- a/include/caffe/neuron_layers.hpp +++ b/include/caffe/neuron_layers.hpp @@ -9,6 +9,7 @@ #include "caffe/common.hpp" #include "caffe/layer.hpp" #include "caffe/proto/caffe.pb.h" +#include "caffe/util/ocl_wrapper.hpp" #define HDF5_DATA_DATASET_NAME "data" #define HDF5_DATA_LABEL_NAME "label" @@ -22,15 +23,20 @@ namespace caffe { * element. */ template -class NeuronLayer : public Layer { - public: - explicit NeuronLayer(const LayerParameter& param) - : Layer(param) {} - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int ExactNumTopBlobs() const { return 1; } +class NeuronLayer: public Layer { + public: + explicit NeuronLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } }; /** @@ -44,45 +50,52 @@ class NeuronLayer : public Layer { * the computed outputs @f$ y = |x| @f$ */ template -class AbsValLayer : public NeuronLayer { - public: - explicit AbsValLayer(const LayerParameter& param) - : NeuronLayer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "AbsVal"; } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int ExactNumTopBlobs() const { return 1; } - - protected: - /// @copydoc AbsValLayer - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the absolute value inputs. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (N \times C \times H \times W) @f$ - * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ - * with respect to computed outputs @f$ y @f$ - * @param propagate_down see Layer::Backward. 
- * @param bottom input Blob vector (length 2) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$; Backward fills their diff with - * gradients @f$ - * \frac{\partial E}{\partial x} = - * \mathrm{sign}(x) \frac{\partial E}{\partial y} - * @f$ if propagate_down[0] - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); +class AbsValLayer: public NeuronLayer { + public: + explicit AbsValLayer(const LayerParameter& param) + : NeuronLayer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "AbsVal"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + /// @copydoc AbsValLayer + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the absolute value inputs. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (N \times C \times H \times W) @f$ + * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ + * with respect to computed outputs @f$ y @f$ + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length 2) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$; Backward fills their diff with + * gradients @f$ + * \frac{\partial E}{\partial x} = + * \mathrm{sign}(x) \frac{\partial E}{\partial y} + * @f$ if propagate_down[0] + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); }; /** @@ -103,40 +116,43 @@ class AbsValLayer : public NeuronLayer { * @f$ */ template -class BNLLLayer : public NeuronLayer { - public: - explicit BNLLLayer(const LayerParameter& param) - : NeuronLayer(param) {} - - virtual inline const char* type() const { return "BNLL"; } - - protected: - /// @copydoc BNLLLayer - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the BNLL inputs. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (N \times C \times H \times W) @f$ - * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ - * with respect to computed outputs @f$ y @f$ - * @param propagate_down see Layer::Backward. 
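// The AbsValLayer gradient quoted above is just a sign flip of the incoming
// diff; a minimal sketch on raw buffers (hypothetical helper, separate from
// the layer's own Backward_cpu/Backward_gpu declared above):
#include <vector>

void absval_backward(const std::vector<float>& bottom_data,   // the inputs x
                     const std::vector<float>& top_diff,      // dE/dy
                     std::vector<float>* bottom_diff) {       // receives dE/dx
  bottom_diff->resize(bottom_data.size());
  for (size_t i = 0; i < bottom_data.size(); ++i) {
    const float sign = (bottom_data[i] > 0.f) ? 1.f
                     : (bottom_data[i] < 0.f) ? -1.f : 0.f;
    (*bottom_diff)[i] = sign * top_diff[i];                   // sign(x) * dE/dy
  }
}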
- * @param bottom input Blob vector (length 2) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$; Backward fills their diff with - * gradients @f$ - * \frac{\partial E}{\partial x} - * @f$ if propagate_down[0] - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); +class BNLLLayer: public NeuronLayer { + public: + explicit BNLLLayer(const LayerParameter& param) + : NeuronLayer(param) { + } + + virtual inline const char* type() const { + return "BNLL"; + } + + protected: + /// @copydoc BNLLLayer + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the BNLL inputs. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (N \times C \times H \times W) @f$ + * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ + * with respect to computed outputs @f$ y @f$ + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length 2) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$; Backward fills their diff with + * gradients @f$ + * \frac{\partial E}{\partial x} + * @f$ if propagate_down[0] + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); }; /** @@ -151,56 +167,60 @@ class BNLLLayer : public NeuronLayer { * the computed outputs @f$ y = |x| @f$ */ template -class DropoutLayer : public NeuronLayer { - public: - /** - * @param param provides DropoutParameter dropout_param, - * with DropoutLayer options: - * - dropout_ratio (\b optional, default 0.5). - * Sets the probability @f$ p @f$ that any given unit is dropped. - */ - explicit DropoutLayer(const LayerParameter& param) - : NeuronLayer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Dropout"; } - - protected: - /** - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$ - * @param top output Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the computed outputs. At training time, we have @f$ - * y_{\mbox{train}} = \left\{ - * \begin{array}{ll} - * \frac{x}{1 - p} & \mbox{if } u > p \\ +class DropoutLayer: public NeuronLayer { + public: + /** + * @param param provides DropoutParameter dropout_param, + * with DropoutLayer options: + * - dropout_ratio (\b optional, default 0.5). + * Sets the probability @f$ p @f$ that any given unit is dropped. + */ + explicit DropoutLayer(const LayerParameter& param) + : + NeuronLayer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Dropout"; + } + + protected: + /** + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$ + * @param top output Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the computed outputs. 
At training time, we have @f$ + * y_{\mbox{train}} = \left\{ + * \begin{array}{ll} + * \frac{x}{1 - p} & \mbox{if } u > p \\ * 0 & \mbox{otherwise} - * \end{array} \right. - * @f$, where @f$ u \sim U(0, 1)@f$ is generated independently for each - * input at each iteration. At test time, we simply have - * @f$ y_{\mbox{test}} = \mathbb{E}[y_{\mbox{train}}] = x @f$. - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - /// when divided by UINT_MAX, the randomly generated values @f$u\sim U(0,1)@f$ - Blob rand_vec_; - /// the probability @f$ p @f$ of dropping any input - Dtype threshold_; - /// the scale for undropped inputs at train time @f$ 1 / (1 - p) @f$ - Dtype scale_; - unsigned int uint_thres_; + * \end{array} \right. + * @f$, where @f$ u \sim U(0, 1)@f$ is generated independently for each + * input at each iteration. At test time, we simply have + * @f$ y_{\mbox{test}} = \mathbb{E}[y_{\mbox{train}}] = x @f$. + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + /// when divided by UINT_MAX, the randomly generated values @f$u\sim U(0,1)@f$ + Blob rand_vec_; + /// the probability @f$ p @f$ of dropping any input + Dtype threshold_; + /// the scale for undropped inputs at train time @f$ 1 / (1 - p) @f$ + Dtype scale_; + unsigned int uint_thres_; }; /** @@ -209,62 +229,65 @@ class DropoutLayer : public NeuronLayer { * and base @f$ \gamma @f$. */ template -class ExpLayer : public NeuronLayer { - public: - /** - * @param param provides ExpParameter exp_param, - * with ExpLayer options: - * - scale (\b optional, default 1) the scale @f$ \alpha @f$ - * - shift (\b optional, default 0) the shift @f$ \beta @f$ - * - base (\b optional, default -1 for a value of @f$ e \approx 2.718 @f$) - * the base @f$ \gamma @f$ - */ - explicit ExpLayer(const LayerParameter& param) - : NeuronLayer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Exp"; } - - protected: - /** - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$ - * @param top output Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the computed outputs @f$ - * y = \gamma ^ {\alpha x + \beta} - * @f$ - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the exp inputs. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (N \times C \times H \times W) @f$ - * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ - * with respect to computed outputs @f$ y @f$ - * @param propagate_down see Layer::Backward. 
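// Train-time dropout exactly as the formula above states: each unit survives
// with probability 1 - p and is rescaled by 1/(1 - p) so its expectation
// matches the test-time identity mapping. A self-contained sketch; the real
// layer draws its random mask into rand_vec_ through Caffe's RNG, and
// std::rand here is only a stand-in.
#include <cstdlib>
#include <vector>

void dropout_forward_train(const std::vector<float>& x, float p,
                           std::vector<float>* y) {
  const float scale = 1.f / (1.f - p);
  y->resize(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    const float u = static_cast<float>(std::rand()) / RAND_MAX;  // u ~ U(0,1)
    (*y)[i] = (u > p) ? x[i] * scale : 0.f;                      // drop or rescale
  }
}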
- * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$; Backward fills their diff with - * gradients @f$ - * \frac{\partial E}{\partial x} = - * \frac{\partial E}{\partial y} y \alpha \log_e(gamma) - * @f$ if propagate_down[0] - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - Dtype inner_scale_, outer_scale_; +class ExpLayer: public NeuronLayer { + public: + /** + * @param param provides ExpParameter exp_param, + * with ExpLayer options: + * - scale (\b optional, default 1) the scale @f$ \alpha @f$ + * - shift (\b optional, default 0) the shift @f$ \beta @f$ + * - base (\b optional, default -1 for a value of @f$ e \approx 2.718 @f$) + * the base @f$ \gamma @f$ + */ + explicit ExpLayer(const LayerParameter& param) + : NeuronLayer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Exp"; + } + + protected: + /** + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$ + * @param top output Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the computed outputs @f$ + * y = \gamma ^ {\alpha x + \beta} + * @f$ + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the exp inputs. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (N \times C \times H \times W) @f$ + * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ + * with respect to computed outputs @f$ y @f$ + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$; Backward fills their diff with + * gradients @f$ + * \frac{\partial E}{\partial x} = + * \frac{\partial E}{\partial y} y \alpha \log_e(gamma) + * @f$ if propagate_down[0] + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + Dtype inner_scale_, outer_scale_; }; /** @@ -273,64 +296,67 @@ class ExpLayer : public NeuronLayer { * and base @f$ \gamma @f$. 
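// Aside: a standalone sketch of the ExpLayer mapping above (illustration only).
// Note that "log_e(gamma)" in its Backward doc is the natural log of the base
// (the doxygen source is missing the backslash on \gamma).
#include <cmath>

float exp_layer_forward(float x, float alpha, float beta, float gamma) {
  return std::pow(gamma, alpha * x + beta);     // y = gamma^(alpha*x + beta)
}

// dy/dx = y * alpha * ln(gamma), hence dE/dx = dE/dy * y * alpha * ln(gamma).
float exp_layer_backward(float top_diff, float y, float alpha, float gamma) {
  return top_diff * y * alpha * std::log(gamma);
}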
*/ template -class LogLayer : public NeuronLayer { - public: - /** - * @param param provides LogParameter log_param, - * with LogLayer options: - * - scale (\b optional, default 1) the scale @f$ \alpha @f$ - * - shift (\b optional, default 0) the shift @f$ \beta @f$ - * - base (\b optional, default -1 for a value of @f$ e \approx 2.718 @f$) - * the base @f$ \gamma @f$ - */ - explicit LogLayer(const LayerParameter& param) - : NeuronLayer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Log"; } - - protected: - /** - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$ - * @param top output Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the computed outputs @f$ - * y = log_{\gamma}(\alpha x + \beta) - * @f$ - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the exp inputs. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (N \times C \times H \times W) @f$ - * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ - * with respect to computed outputs @f$ y @f$ - * @param propagate_down see Layer::Backward. - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$; Backward fills their diff with - * gradients @f$ - * \frac{\partial E}{\partial x} = - * \frac{\partial E}{\partial y} y \alpha \log_e(gamma) - * @f$ if propagate_down[0] - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - Dtype base_scale_; - Dtype input_scale_, input_shift_; - Dtype backward_num_scale_; +class LogLayer: public NeuronLayer { + public: + /** + * @param param provides LogParameter log_param, + * with LogLayer options: + * - scale (\b optional, default 1) the scale @f$ \alpha @f$ + * - shift (\b optional, default 0) the shift @f$ \beta @f$ + * - base (\b optional, default -1 for a value of @f$ e \approx 2.718 @f$) + * the base @f$ \gamma @f$ + */ + explicit LogLayer(const LayerParameter& param) + : NeuronLayer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Log"; + } + + protected: + /** + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$ + * @param top output Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the computed outputs @f$ + * y = log_{\gamma}(\alpha x + \beta) + * @f$ + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the exp inputs. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (N \times C \times H \times W) @f$ + * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ + * with respect to computed outputs @f$ y @f$ + * @param propagate_down see Layer::Backward. 
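// Note: the gradient printed in this Backward doc repeats the ExpLayer
// expression. For y = log_gamma(alpha*x + beta) the chain rule gives
// dE/dx = dE/dy * alpha / ((alpha*x + beta) * ln(gamma)). A standalone
// finite-difference check of that expression (illustration only):
#include <cassert>
#include <cmath>

void check_log_layer_gradient() {
  const double alpha = 2.0, beta = 0.5, gamma = 10.0, x = 1.3, h = 1e-6;
  auto f = [&](double v) { return std::log(alpha * v + beta) / std::log(gamma); };
  const double numeric = (f(x + h) - f(x - h)) / (2.0 * h);      // central diff
  const double analytic = alpha / ((alpha * x + beta) * std::log(gamma));
  assert(std::fabs(numeric - analytic) < 1e-6);
}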
+ * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$; Backward fills their diff with + * gradients @f$ + * \frac{\partial E}{\partial x} = + * \frac{\partial E}{\partial y} y \alpha \log_e(gamma) + * @f$ if propagate_down[0] + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + Dtype base_scale_; + Dtype input_scale_, input_shift_; + Dtype backward_num_scale_; }; /** @@ -339,71 +365,74 @@ class LogLayer : public NeuronLayer { * and power @f$ \gamma @f$. */ template -class PowerLayer : public NeuronLayer { - public: - /** - * @param param provides PowerParameter power_param, - * with PowerLayer options: - * - scale (\b optional, default 1) the scale @f$ \alpha @f$ - * - shift (\b optional, default 0) the shift @f$ \beta @f$ - * - power (\b optional, default 1) the power @f$ \gamma @f$ - */ - explicit PowerLayer(const LayerParameter& param) - : NeuronLayer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Power"; } - - protected: - /** - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$ - * @param top output Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the computed outputs @f$ - * y = (\alpha x + \beta) ^ \gamma - * @f$ - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the power inputs. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (N \times C \times H \times W) @f$ - * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ - * with respect to computed outputs @f$ y @f$ - * @param propagate_down see Layer::Backward. 
- * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$; Backward fills their diff with - * gradients @f$ - * \frac{\partial E}{\partial x} = - * \frac{\partial E}{\partial y} - * \alpha \gamma (\alpha x + \beta) ^ {\gamma - 1} = - * \frac{\partial E}{\partial y} - * \frac{\alpha \gamma y}{\alpha x + \beta} - * @f$ if propagate_down[0] - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - /// @brief @f$ \gamma @f$ from layer_param_.power_param() - Dtype power_; - /// @brief @f$ \alpha @f$ from layer_param_.power_param() - Dtype scale_; - /// @brief @f$ \beta @f$ from layer_param_.power_param() - Dtype shift_; - /// @brief Result of @f$ \alpha \gamma @f$ - Dtype diff_scale_; +class PowerLayer: public NeuronLayer { + public: + /** + * @param param provides PowerParameter power_param, + * with PowerLayer options: + * - scale (\b optional, default 1) the scale @f$ \alpha @f$ + * - shift (\b optional, default 0) the shift @f$ \beta @f$ + * - power (\b optional, default 1) the power @f$ \gamma @f$ + */ + explicit PowerLayer(const LayerParameter& param) + : NeuronLayer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Power"; + } + + protected: + /** + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$ + * @param top output Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the computed outputs @f$ + * y = (\alpha x + \beta) ^ \gamma + * @f$ + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the power inputs. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (N \times C \times H \times W) @f$ + * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ + * with respect to computed outputs @f$ y @f$ + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$; Backward fills their diff with + * gradients @f$ + * \frac{\partial E}{\partial x} = + * \frac{\partial E}{\partial y} + * \alpha \gamma (\alpha x + \beta) ^ {\gamma - 1} = + * \frac{\partial E}{\partial y} + * \frac{\alpha \gamma y}{\alpha x + \beta} + * @f$ if propagate_down[0] + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + /// @brief @f$ \gamma @f$ from layer_param_.power_param() + Dtype power_; + /// @brief @f$ \alpha @f$ from layer_param_.power_param() + Dtype scale_; + /// @brief @f$ \beta @f$ from layer_param_.power_param() + Dtype shift_; + /// @brief Result of @f$ \alpha \gamma @f$ + Dtype diff_scale_; }; /** @@ -411,68 +440,70 @@ class PowerLayer : public NeuronLayer { * The simple max is fast to compute, and the function does not saturate. 
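// Aside on the PowerLayer declared above: the two gradient forms in its
// Backward doc agree because y = (alpha*x + beta)^gamma, so
// y / (alpha*x + beta) equals (alpha*x + beta)^(gamma - 1). A quick
// standalone check (illustration only):
#include <cassert>
#include <cmath>

void check_power_layer_gradient_forms() {
  const double alpha = 1.5, beta = 0.2, gamma = 3.0, x = 0.7;
  const double base = alpha * x + beta;
  const double y = std::pow(base, gamma);
  const double form1 = alpha * gamma * std::pow(base, gamma - 1.0);
  const double form2 = alpha * gamma * y / base;
  assert(std::fabs(form1 - form2) < 1e-9);
}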
*/ template -class ReLULayer : public NeuronLayer { - public: - /** - * @param param provides ReLUParameter relu_param, - * with ReLULayer options: - * - negative_slope (\b optional, default 0). - * the value @f$ \nu @f$ by which negative values are multiplied. - */ - explicit ReLULayer(const LayerParameter& param) - : NeuronLayer(param) {} - - virtual inline const char* type() const { return "ReLU"; } - - protected: - /** - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$ - * @param top output Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the computed outputs @f$ - * y = \max(0, x) - * @f$ by default. If a non-zero negative_slope @f$ \nu @f$ is provided, - * the computed outputs are @f$ y = \max(0, x) + \nu \min(0, x) @f$. - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the ReLU inputs. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (N \times C \times H \times W) @f$ - * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ - * with respect to computed outputs @f$ y @f$ - * @param propagate_down see Layer::Backward. - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$; Backward fills their diff with - * gradients @f$ - * \frac{\partial E}{\partial x} = \left\{ - * \begin{array}{lr} - * 0 & \mathrm{if} \; x \le 0 \\ +class ReLULayer: public NeuronLayer { + public: + /** + * @param param provides ReLUParameter relu_param, + * with ReLULayer options: + * - negative_slope (\b optional, default 0). + * the value @f$ \nu @f$ by which negative values are multiplied. + */ + explicit ReLULayer(const LayerParameter& param) + : NeuronLayer(param) { + } + virtual inline const char* type() const { + return "ReLU"; + } + + protected: + /** + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$ + * @param top output Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the computed outputs @f$ + * y = \max(0, x) + * @f$ by default. If a non-zero negative_slope @f$ \nu @f$ is provided, + * the computed outputs are @f$ y = \max(0, x) + \nu \min(0, x) @f$. + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the ReLU inputs. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (N \times C \times H \times W) @f$ + * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ + * with respect to computed outputs @f$ y @f$ + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$; Backward fills their diff with + * gradients @f$ + * \frac{\partial E}{\partial x} = \left\{ + * \begin{array}{lr} + * 0 & \mathrm{if} \; x \le 0 \\ * \frac{\partial E}{\partial y} & \mathrm{if} \; x > 0 - * \end{array} \right. - * @f$ if propagate_down[0], by default. 
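// Aside: in scalar form the ReLULayer rules (including the negative_slope
// variant spelled out just below) reduce to a couple of lines
// (illustration only, not the layer's implementation):
#include <algorithm>

float relu_forward(float x, float nu) {
  return std::max(0.0f, x) + nu * std::min(0.0f, x);  // y = max(0,x)+nu*min(0,x)
}

// dE/dx = dE/dy for x > 0 and nu * dE/dy for x <= 0 (0 when nu == 0).
float relu_backward(float top_diff, float x, float nu) {
  return top_diff * (x > 0 ? 1.0f : nu);
}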
- * If a non-zero negative_slope @f$ \nu @f$ is provided, - * the computed gradients are @f$ - * \frac{\partial E}{\partial x} = \left\{ - * \begin{array}{lr} - * \nu \frac{\partial E}{\partial y} & \mathrm{if} \; x \le 0 \\ + * \end{array} \right. + * @f$ if propagate_down[0], by default. + * If a non-zero negative_slope @f$ \nu @f$ is provided, + * the computed gradients are @f$ + * \frac{\partial E}{\partial x} = \left\{ + * \begin{array}{lr} + * \nu \frac{\partial E}{\partial y} & \mathrm{if} \; x \le 0 \\ * \frac{\partial E}{\partial y} & \mathrm{if} \; x > 0 - * \end{array} \right. - * @f$. - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + * \end{array} \right. + * @f$. + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); }; #ifdef USE_CUDNN @@ -481,23 +512,23 @@ class ReLULayer : public NeuronLayer { */ template class CuDNNReLULayer : public ReLULayer { - public: + public: explicit CuDNNReLULayer(const LayerParameter& param) - : ReLULayer(param), handles_setup_(false) {} + : ReLULayer(param), handles_setup_(false) {} virtual void LayerSetUp(const vector*>& bottom, const vector*>& top); virtual void Reshape(const vector*>& bottom, const vector*>& top); virtual ~CuDNNReLULayer(); - protected: + protected: virtual void Forward_gpu(const vector*>& bottom, const vector*>& top); virtual void Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); bool handles_setup_; - cudnnHandle_t handle_; + cudnnHandle_t handle_; cudnnTensorDescriptor_t bottom_desc_; cudnnTensorDescriptor_t top_desc_; }; @@ -512,50 +543,53 @@ class CuDNNReLULayer : public ReLULayer { * The ReLULayer is often a better choice for this reason. */ template -class SigmoidLayer : public NeuronLayer { - public: - explicit SigmoidLayer(const LayerParameter& param) - : NeuronLayer(param) {} - - virtual inline const char* type() const { return "Sigmoid"; } - - protected: - /** - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$ - * @param top output Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the computed outputs @f$ - * y = (1 + \exp(-x))^{-1} - * @f$ - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the sigmoid inputs. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (N \times C \times H \times W) @f$ - * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ - * with respect to computed outputs @f$ y @f$ - * @param propagate_down see Layer::Backward. 
- * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$; Backward fills their diff with - * gradients @f$ - * \frac{\partial E}{\partial x} - * = \frac{\partial E}{\partial y} y (1 - y) - * @f$ if propagate_down[0] - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); +class SigmoidLayer: public NeuronLayer { + public: + explicit SigmoidLayer(const LayerParameter& param) + : NeuronLayer(param) { + } + + virtual inline const char* type() const { + return "Sigmoid"; + } + + protected: + /** + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$ + * @param top output Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the computed outputs @f$ + * y = (1 + \exp(-x))^{-1} + * @f$ + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the sigmoid inputs. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (N \times C \times H \times W) @f$ + * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ + * with respect to computed outputs @f$ y @f$ + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$; Backward fills their diff with + * gradients @f$ + * \frac{\partial E}{\partial x} + * = \frac{\partial E}{\partial y} y (1 - y) + * @f$ if propagate_down[0] + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); }; #ifdef USE_CUDNN @@ -564,23 +598,23 @@ class SigmoidLayer : public NeuronLayer { */ template class CuDNNSigmoidLayer : public SigmoidLayer { - public: + public: explicit CuDNNSigmoidLayer(const LayerParameter& param) - : SigmoidLayer(param), handles_setup_(false) {} + : SigmoidLayer(param), handles_setup_(false) {} virtual void LayerSetUp(const vector*>& bottom, const vector*>& top); virtual void Reshape(const vector*>& bottom, const vector*>& top); virtual ~CuDNNSigmoidLayer(); - protected: + protected: virtual void Forward_gpu(const vector*>& bottom, const vector*>& top); virtual void Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); bool handles_setup_; - cudnnHandle_t handle_; + cudnnHandle_t handle_; cudnnTensorDescriptor_t bottom_desc_; cudnnTensorDescriptor_t top_desc_; }; @@ -595,52 +629,55 @@ class CuDNNSigmoidLayer : public SigmoidLayer { * The ReLULayer is often a better choice for this reason. 
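// Aside on the SigmoidLayer above: its Backward pass relies on the derivative
// being expressible purely in terms of the output y. A standalone scalar
// sketch (illustration only):
#include <cmath>

float sigmoid_forward(float x) {
  return 1.0f / (1.0f + std::exp(-x));       // y = (1 + exp(-x))^-1
}

float sigmoid_backward(float top_diff, float y) {
  return top_diff * y * (1.0f - y);          // dE/dx = dE/dy * y * (1 - y)
}
// As |x| grows, y*(1-y) -> 0, which is the vanishing gradient noted in the
// briefs for SigmoidLayer and TanHLayer.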
*/ template -class TanHLayer : public NeuronLayer { - public: - explicit TanHLayer(const LayerParameter& param) - : NeuronLayer(param) {} - - virtual inline const char* type() const { return "TanH"; } - - protected: - /** - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$ - * @param top output Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the computed outputs @f$ - * y = \frac{\exp(2x) - 1}{\exp(2x) + 1} - * @f$ - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the sigmoid inputs. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (N \times C \times H \times W) @f$ - * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ - * with respect to computed outputs @f$ y @f$ - * @param propagate_down see Layer::Backward. - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$; Backward fills their diff with - * gradients @f$ - * \frac{\partial E}{\partial x} - * = \frac{\partial E}{\partial y} - * \left(1 - \left[\frac{\exp(2x) - 1}{exp(2x) + 1} \right]^2 \right) - * = \frac{\partial E}{\partial y} (1 - y^2) - * @f$ if propagate_down[0] - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); +class TanHLayer: public NeuronLayer { + public: + explicit TanHLayer(const LayerParameter& param) + : NeuronLayer(param) { + } + + virtual inline const char* type() const { + return "TanH"; + } + + protected: + /** + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$ + * @param top output Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the computed outputs @f$ + * y = \frac{\exp(2x) - 1}{\exp(2x) + 1} + * @f$ + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the sigmoid inputs. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (N \times C \times H \times W) @f$ + * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ + * with respect to computed outputs @f$ y @f$ + * @param propagate_down see Layer::Backward. 
+ * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$; Backward fills their diff with + * gradients @f$ + * \frac{\partial E}{\partial x} + * = \frac{\partial E}{\partial y} + * \left(1 - \left[\frac{\exp(2x) - 1}{exp(2x) + 1} \right]^2 \right) + * = \frac{\partial E}{\partial y} (1 - y^2) + * @f$ if propagate_down[0] + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); }; #ifdef USE_CUDNN @@ -649,23 +686,23 @@ class TanHLayer : public NeuronLayer { */ template class CuDNNTanHLayer : public TanHLayer { - public: + public: explicit CuDNNTanHLayer(const LayerParameter& param) - : TanHLayer(param), handles_setup_(false) {} + : TanHLayer(param), handles_setup_(false) {} virtual void LayerSetUp(const vector*>& bottom, const vector*>& top); virtual void Reshape(const vector*>& bottom, const vector*>& top); virtual ~CuDNNTanHLayer(); - protected: + protected: virtual void Forward_gpu(const vector*>& bottom, const vector*>& top); virtual void Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); bool handles_setup_; - cudnnHandle_t handle_; + cudnnHandle_t handle_; cudnnTensorDescriptor_t bottom_desc_; cudnnTensorDescriptor_t top_desc_; }; @@ -676,47 +713,51 @@ class CuDNNTanHLayer : public TanHLayer { * above threshold; 0 otherwise. */ template -class ThresholdLayer : public NeuronLayer { - public: - /** - * @param param provides ThresholdParameter threshold_param, - * with ThresholdLayer options: - * - threshold (\b optional, default 0). - * the threshold value @f$ t @f$ to which the input values are compared. - */ - explicit ThresholdLayer(const LayerParameter& param) - : NeuronLayer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Threshold"; } - - protected: - /** - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the inputs @f$ x @f$ - * @param top output Blob vector (length 1) - * -# @f$ (N \times C \times H \times W) @f$ - * the computed outputs @f$ - * y = \left\{ - * \begin{array}{lr} - * 0 & \mathrm{if} \; x \le t \\ +class ThresholdLayer: public NeuronLayer { + public: + /** + * @param param provides ThresholdParameter threshold_param, + * with ThresholdLayer options: + * - threshold (\b optional, default 0). + * the threshold value @f$ t @f$ to which the input values are compared. + */ + explicit ThresholdLayer(const LayerParameter& param) + : NeuronLayer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Threshold"; + } + + protected: + /** + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$ + * @param top output Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the computed outputs @f$ + * y = \left\{ + * \begin{array}{lr} + * 0 & \mathrm{if} \; x \le t \\ * 1 & \mathrm{if} \; x > t - * \end{array} \right. 
- * @f$ - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - /// @brief Not implemented (non-differentiable function) - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - NOT_IMPLEMENTED; - } - - Dtype threshold_; + * \end{array} \right. + * @f$ + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + /// @brief Not implemented (non-differentiable function) + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + NOT_IMPLEMENTED; + } + + Dtype threshold_; }; /** @@ -728,80 +769,83 @@ class ThresholdLayer : public NeuronLayer { * equal to 2. The 1st axis (0-based) is seen as channels. */ template -class PReLULayer : public NeuronLayer { - public: - /** - * @param param provides PReLUParameter prelu_param, - * with PReLULayer options: - * - filler (\b optional, FillerParameter, - * default {'type': constant 'value':0.25}). - * - channel_shared (\b optional, default false). - * negative slopes are shared across channels. - */ - explicit PReLULayer(const LayerParameter& param) - : NeuronLayer(param) {} - - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "PReLU"; } - - protected: - /** - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times ...) @f$ - * the inputs @f$ x @f$ - * @param top output Blob vector (length 1) - * -# @f$ (N \times C \times ...) @f$ - * the computed outputs for each channel @f$i@f$ @f$ - * y_i = \max(0, x_i) + a_i \min(0, x_i) - * @f$. - */ - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - - /** - * @brief Computes the error gradient w.r.t. the PReLU inputs. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * -# @f$ (N \times C \times ...) @f$ - * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ - * with respect to computed outputs @f$ y @f$ - * @param propagate_down see Layer::Backward. - * @param bottom input Blob vector (length 1) - * -# @f$ (N \times C \times ...) @f$ - * the inputs @f$ x @f$; For each channel @f$i@f$, backward fills their - * diff with gradients @f$ - * \frac{\partial E}{\partial x_i} = \left\{ - * \begin{array}{lr} - * a_i \frac{\partial E}{\partial y_i} & \mathrm{if} \; x_i \le 0 \\ +class PReLULayer: public NeuronLayer { + public: + /** + * @param param provides PReLUParameter prelu_param, + * with PReLULayer options: + * - filler (\b optional, FillerParameter, + * default {'type': constant 'value':0.25}). + * - channel_shared (\b optional, default false). + * negative slopes are shared across channels. + */ + explicit PReLULayer(const LayerParameter& param) + : NeuronLayer(param) { + } + + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "PReLU"; + } + + protected: + /** + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times ...) 
@f$ + * the inputs @f$ x @f$ + * @param top output Blob vector (length 1) + * -# @f$ (N \times C \times ...) @f$ + * the computed outputs for each channel @f$i@f$ @f$ + * y_i = \max(0, x_i) + a_i \min(0, x_i) + * @f$. + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the PReLU inputs. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (N \times C \times ...) @f$ + * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ + * with respect to computed outputs @f$ y @f$ + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times ...) @f$ + * the inputs @f$ x @f$; For each channel @f$i@f$, backward fills their + * diff with gradients @f$ + * \frac{\partial E}{\partial x_i} = \left\{ + * \begin{array}{lr} + * a_i \frac{\partial E}{\partial y_i} & \mathrm{if} \; x_i \le 0 \\ * \frac{\partial E}{\partial y_i} & \mathrm{if} \; x_i > 0 - * \end{array} \right. - * @f$. - * If param_propagate_down_[0] is true, it fills the diff with gradients - * @f$ - * \frac{\partial E}{\partial a_i} = \left\{ - * \begin{array}{lr} - * \sum_{x_i} x_i \frac{\partial E}{\partial y_i} & \mathrm{if} \; x_i \le 0 \\ + * \end{array} \right. + * @f$. + * If param_propagate_down_[0] is true, it fills the diff with gradients + * @f$ + * \frac{\partial E}{\partial a_i} = \left\{ + * \begin{array}{lr} + * \sum_{x_i} x_i \frac{\partial E}{\partial y_i} & \mathrm{if} \; x_i \le 0 \\ * 0 & \mathrm{if} \; x_i > 0 - * \end{array} \right. - * @f$. - */ - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - bool channel_shared_; - Blob multiplier_; // dot multiplier for backward computation of params - Blob backward_buff_; // temporary buffer for backward computation - Blob bottom_memory_; // memory for in-place computation + * \end{array} \right. + * @f$. 
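// Aside: in scalar form PReLU's forward pass and the two gradients documented
// above are (illustration only, ignoring the per-channel bookkeeping):
float prelu_forward(float x, float a) {
  return x > 0 ? x : a * x;                  // y = max(0,x) + a*min(0,x)
}
float prelu_backward_x(float top_diff, float x, float a) {
  return top_diff * (x > 0 ? 1.0f : a);      // dE/dx
}
float prelu_backward_a(float top_diff, float x) {
  return x > 0 ? 0.0f : top_diff * x;        // one element's share of dE/da_i
}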
+ */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + bool channel_shared_; + Blob multiplier_; // dot multiplier for backward computation of params + Blob backward_buff_; // temporary buffer for backward computation + Blob bottom_memory_; // memory for in-place computation }; } // namespace caffe diff --git a/include/caffe/python_layer.hpp b/include/caffe/python_layer.hpp index 19cf18c9..16d1f7fc 100644 --- a/include/caffe/python_layer.hpp +++ b/include/caffe/python_layer.hpp @@ -11,55 +11,59 @@ namespace bp = boost::python; namespace caffe { template -class PythonLayer : public Layer { - public: - PythonLayer(PyObject* self, const LayerParameter& param) - : Layer(param), self_(bp::handle<>(bp::borrowed(self))) { } +class PythonLayer: public Layer { + public: + PythonLayer(PyObject* self, const LayerParameter& param) + : Layer(param), self_(bp::handle<>(bp::borrowed(self))) { + } - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top) { - try { - self_.attr("setup")(bottom, top); - } catch (bp::error_already_set) { - PyErr_Print(); - throw; + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top) { + try { + self_.attr("setup")(bottom, top); + } catch (bp::error_already_set) { + PyErr_Print(); + throw; + } } - } - virtual void Reshape(const vector*>& bottom, - const vector*>& top) { - try { - self_.attr("reshape")(bottom, top); - } catch (bp::error_already_set) { - PyErr_Print(); - throw; + virtual void Reshape(const vector*>& bottom, + const vector*>& top) { + try { + self_.attr("reshape")(bottom, top); + } catch (bp::error_already_set) { + PyErr_Print(); + throw; + } } - } - virtual inline const char* type() const { return "Python"; } + virtual inline const char* type() const { + return "Python"; + } - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top) { - try { - self_.attr("forward")(bottom, top); - } catch (bp::error_already_set) { - PyErr_Print(); - throw; + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top) { + try { + self_.attr("forward")(bottom, top); + } catch (bp::error_already_set) { + PyErr_Print(); + throw; + } } - } - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - try { - self_.attr("backward")(top, propagate_down, bottom); - } catch (bp::error_already_set) { - PyErr_Print(); - throw; + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + try { + self_.attr("backward")(top, propagate_down, bottom); + } catch (bp::error_already_set) { + PyErr_Print(); + throw; + } } - } - private: - bp::object self_; + private: + bp::object self_; }; } // namespace caffe diff --git a/include/caffe/solver.hpp b/include/caffe/solver.hpp index c2ced487..2bddb77f 100644 --- a/include/caffe/solver.hpp +++ b/include/caffe/solver.hpp @@ -16,116 +16,150 @@ namespace caffe { */ template class Solver { - public: - explicit Solver(const SolverParameter& param); - explicit Solver(const string& param_file); - void Init(const SolverParameter& param); - void InitTrainNet(); - void InitTestNets(); - // The main entry of the solver function. In default, iter will be zero. Pass - // in a non-zero iter number to resume training for a pre-trained net. 
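// Aside: a minimal usage sketch of the Solve() entry points declared here
// (illustration only; the prototxt and snapshot file names are hypothetical):
#include <string>
#include "caffe/solver.hpp"

void resume_training_example() {
  // The param_file constructor parses the SolverParameter prototxt itself.
  caffe::SGDSolver<float> solver("solver.prototxt");
  // The string overload forwards to Solve(const char*); passing nothing
  // (i.e. NULL) starts training from iteration zero instead of restoring.
  solver.Solve(std::string("snapshots/model_iter_10000.solverstate"));
}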
- virtual void Solve(const char* resume_file = NULL); - inline void Solve(const string resume_file) { Solve(resume_file.c_str()); } - void Step(int iters); - // The Restore function implements how one should restore the solver to a - // previously snapshotted state. You should implement the RestoreSolverState() - // function that restores the state from a SolverState protocol buffer. - void Restore(const char* resume_file); - virtual ~Solver() {} - inline shared_ptr > net() { return net_; } - inline const vector > >& test_nets() { - return test_nets_; - } - int iter() { return iter_; } - - protected: - // Make and apply the update value for the current iteration. - virtual void ApplyUpdate() = 0; - // The Solver::Snapshot function implements the basic snapshotting utility - // that stores the learned net. You should implement the SnapshotSolverState() - // function that produces a SolverState protocol buffer that needs to be - // written to disk together with the learned net. - void Snapshot(); - // The test routine - void TestAll(); - void Test(const int test_net_id = 0); - virtual void SnapshotSolverState(SolverState* state) = 0; - virtual void RestoreSolverState(const SolverState& state) = 0; - void DisplayOutputBlobs(const int net_id); - - SolverParameter param_; - int iter_; - int current_step_; - shared_ptr > net_; - vector > > test_nets_; - - DISABLE_COPY_AND_ASSIGN(Solver); + public: + explicit Solver(const SolverParameter& param); + explicit Solver(const string& param_file); + void Init(const SolverParameter& param); + void InitTrainNet(); + void InitTestNets(); + // The main entry of the solver function. In default, iter will be zero. Pass + // in a non-zero iter number to resume training for a pre-trained net. + virtual void Solve(const char* resume_file = NULL); + inline void Solve(const string resume_file) { + Solve(resume_file.c_str()); + } + void Step(int iters); + // The Restore function implements how one should restore the solver to a + // previously snapshotted state. You should implement the RestoreSolverState() + // function that restores the state from a SolverState protocol buffer. + void Restore(const char* resume_file); + virtual ~Solver() { + } + inline shared_ptr > net() { + return net_; + } + inline const vector > >& test_nets() { + return test_nets_; + } + int iter() { + return iter_; + } + + protected: + // Make and apply the update value for the current iteration. + virtual void ApplyUpdate() = 0; + // The Solver::Snapshot function implements the basic snapshotting utility + // that stores the learned net. You should implement the SnapshotSolverState() + // function that produces a SolverState protocol buffer that needs to be + // written to disk together with the learned net. + void Snapshot(); + // The test routine + void TestAll(); + void Test(const int test_net_id = 0); + virtual void SnapshotSolverState(SolverState* state) = 0; + virtual void RestoreSolverState(const SolverState& state) = 0; + + void DisplayOutputBlobs(const int net_id); + + SolverParameter param_; + int iter_; + int current_step_; + shared_ptr > net_; + vector > > test_nets_; + + void ocl_setup(); + protected: + cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel; + + DISABLE_COPY_AND_ASSIGN (Solver); }; - /** * @brief Optimizes the parameters of a Net using * stochastic gradient descent (SGD) with momentum. 
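// Aside: the diff adds cl_kernel members and an ocl_setup() hook to Solver,
// but their implementation is not shown in this hunk. A minimal sketch of
// what such a setup could look like, assuming an already-built cl_program
// (the program handle and kernel entry-point names here are hypothetical):
#include <CL/cl.h>
#include <glog/logging.h>

void example_ocl_setup(cl_program program,
                       cl_kernel* scalar_kernel, cl_kernel* add_kernel) {
  cl_int err;
  *scalar_kernel = clCreateKernel(program, "scalar_kernel", &err);
  CHECK_EQ(err, CL_SUCCESS) << "clCreateKernel(scalar_kernel) failed";
  *add_kernel = clCreateKernel(program, "add_kernel", &err);
  CHECK_EQ(err, CL_SUCCESS) << "clCreateKernel(add_kernel) failed";
}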
*/ template -class SGDSolver : public Solver { - public: - explicit SGDSolver(const SolverParameter& param) - : Solver(param) { PreSolve(); } - explicit SGDSolver(const string& param_file) - : Solver(param_file) { PreSolve(); } - - const vector > >& history() { return history_; } - - protected: - void PreSolve(); - Dtype GetLearningRate(); - virtual void ApplyUpdate(); - virtual void Normalize(int param_id); - virtual void Regularize(int param_id); - virtual void ComputeUpdateValue(int param_id, Dtype rate); - virtual void ClipGradients(); - virtual void SnapshotSolverState(SolverState * state); - virtual void RestoreSolverState(const SolverState& state); - // history maintains the historical momentum data. - // update maintains update related data and is not needed in snapshots. - // temp maintains other information that might be needed in computation - // of gradients/updates and is not needed in snapshots - vector > > history_, update_, temp_; - - DISABLE_COPY_AND_ASSIGN(SGDSolver); +class SGDSolver: public Solver { + public: + explicit SGDSolver(const SolverParameter& param) + : Solver(param) { + PreSolve(); + } + explicit SGDSolver(const string& param_file) + : Solver(param_file) { + PreSolve(); + } + + const vector > >& history() { + return history_; + } + + protected: + void PreSolve(); + Dtype GetLearningRate(); + virtual void ApplyUpdate(); + virtual void Normalize(int param_id); + virtual void Regularize(int param_id); + virtual void ComputeUpdateValue(int param_id, Dtype rate); + virtual void ClipGradients(); + virtual void SnapshotSolverState(SolverState * state); + virtual void RestoreSolverState(const SolverState& state); + // history maintains the historical momentum data. + // update maintains update related data and is not needed in snapshots. 
+ // temp maintains other information that might be needed in computation + // of gradients/updates and is not needed in snapshots + vector > > history_, update_, temp_; + + void ocl_setup(); + protected: + cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel; + + DISABLE_COPY_AND_ASSIGN (SGDSolver); }; template -class NesterovSolver : public SGDSolver { - public: - explicit NesterovSolver(const SolverParameter& param) - : SGDSolver(param) {} - explicit NesterovSolver(const string& param_file) - : SGDSolver(param_file) {} - - protected: - virtual void ComputeUpdateValue(int param_id, Dtype rate); - - DISABLE_COPY_AND_ASSIGN(NesterovSolver); +class NesterovSolver: public SGDSolver { + public: + explicit NesterovSolver(const SolverParameter& param) + : SGDSolver(param) { + } + explicit NesterovSolver(const string& param_file) + : SGDSolver(param_file) { + } + + protected: + virtual void ComputeUpdateValue(int param_id, Dtype rate); + + void ocl_setup(); + protected: + cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel; + + DISABLE_COPY_AND_ASSIGN (NesterovSolver); }; template -class AdaGradSolver : public SGDSolver { - public: - explicit AdaGradSolver(const SolverParameter& param) - : SGDSolver(param) { constructor_sanity_check(); } - explicit AdaGradSolver(const string& param_file) - : SGDSolver(param_file) { constructor_sanity_check(); } - - protected: - virtual void ComputeUpdateValue(int param_id, Dtype rate); - void constructor_sanity_check() { - CHECK_EQ(0, this->param_.momentum()) - << "Momentum cannot be used with AdaGrad."; - } - - DISABLE_COPY_AND_ASSIGN(AdaGradSolver); +class AdaGradSolver: public SGDSolver { + public: + explicit AdaGradSolver(const SolverParameter& param) + : SGDSolver(param) { + constructor_sanity_check(); + } + explicit AdaGradSolver(const string& param_file) + : SGDSolver(param_file) { + constructor_sanity_check(); + } + + protected: + virtual void ComputeUpdateValue(int param_id, Dtype rate); + void constructor_sanity_check() { + CHECK_EQ(0, this->param_.momentum()) + << "Momentum cannot be used with AdaGrad."; + } + + void ocl_setup(); + protected: + cl_kernel scalar_kernel, add_kernel, div_kernel, powx_kernel; + DISABLE_COPY_AND_ASSIGN (AdaGradSolver); }; template @@ -134,13 +168,13 @@ Solver* GetSolver(const SolverParameter& param) { switch (type) { case SolverParameter_SolverType_SGD: - return new SGDSolver(param); + return new SGDSolver(param); case SolverParameter_SolverType_NESTEROV: - return new NesterovSolver(param); + return new NesterovSolver(param); case SolverParameter_SolverType_ADAGRAD: - return new AdaGradSolver(param); + return new AdaGradSolver(param); default: - LOG(FATAL) << "Unknown SolverType: " << type; + LOG(FATAL) << "Unknown SolverType: " << type; } return (Solver*) NULL; } diff --git a/include/caffe/syncedmem.hpp b/include/caffe/syncedmem.hpp index 1b726de9..4092b5ac 100644 --- a/include/caffe/syncedmem.hpp +++ b/include/caffe/syncedmem.hpp @@ -1,3 +1,29 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + #ifndef CAFFE_SYNCEDMEM_HPP_ #define CAFFE_SYNCEDMEM_HPP_ @@ -31,7 +57,6 @@ inline void CaffeFreeHost(void* ptr) { free(ptr); } - /** * @brief Manages memory allocation and synchronization between the host (CPU) * and device (GPU). @@ -39,35 +64,62 @@ inline void CaffeFreeHost(void* ptr) { * TODO(dox): more thorough description. */ class SyncedMemory { - public: - SyncedMemory() - : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(0), head_(UNINITIALIZED), - own_cpu_data_(false) {} - explicit SyncedMemory(size_t size) - : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(size), head_(UNINITIALIZED), - own_cpu_data_(false) {} - ~SyncedMemory(); - const void* cpu_data(); - void set_cpu_data(void* data); - const void* gpu_data(); - void* mutable_cpu_data(); - void* mutable_gpu_data(); - enum SyncedHead { UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED }; - SyncedHead head() { return head_; } - size_t size() { return size_; } + public: + SyncedMemory() + : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(0), head_(UNINITIALIZED), own_cpu_data_( + false), data_layer_(false) { +#ifndef CPU_ONLY + ocl_setup(); +#endif + } + explicit SyncedMemory(size_t size) + : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(size), head_(UNINITIALIZED), own_cpu_data_( + false), data_layer_(false) { +#ifndef CPU_ONLY + ocl_setup(); +#endif + } - private: - void to_cpu(); - void to_gpu(); - void* cpu_ptr_; - void* gpu_ptr_; - size_t size_; - SyncedHead head_; - bool own_cpu_data_; + ~SyncedMemory(); + const void* cpu_data(); + void set_cpu_data(void* data); + const void* gpu_data(); + const void* gpu_cache_data(); + void* mutable_cpu_data(); + void* mutable_gpu_data(); + enum SyncedHead { + UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED + }; + SyncedHead head() { + return head_; + } + size_t size() { + return size_; + } + void set_data_layer() { + data_layer_ = true; + } +#ifndef CPU_ONLY + private: + void ocl_setup(); +#endif + protected: + cl_kernel oclmem_kernel; - DISABLE_COPY_AND_ASSIGN(SyncedMemory); -}; // class SyncedMemory + private: + void to_cpu(); + void to_gpu(); + void* cpu_ptr_; + void* gpu_ptr_; + void* gpu_cache_ptr_; + size_t size_; + SyncedHead head_; + bool own_cpu_data_; + bool data_layer_; + DISABLE_COPY_AND_ASSIGN (SyncedMemory); +}; +// class SyncedMemory -} // namespace caffe +}// namespace caffe #endif // CAFFE_SYNCEDMEM_HPP_ diff --git a/include/caffe/test/.test_gradient_check_util.hpp.swo b/include/caffe/test/.test_gradient_check_util.hpp.swo new 
file mode 100644 index 00000000..e3ebfc99 Binary files /dev/null and b/include/caffe/test/.test_gradient_check_util.hpp.swo differ diff --git a/include/caffe/test/test_caffe_main.hpp b/include/caffe/test/test_caffe_main.hpp index fc156091..401e2136 100644 --- a/include/caffe/test/test_caffe_main.hpp +++ b/include/caffe/test/test_caffe_main.hpp @@ -15,12 +15,12 @@ using std::cout; using std::endl; #ifdef CMAKE_BUILD - #include "caffe_config.h" +#include "caffe_config.h" #else - #define CUDA_TEST_DEVICE -1 - #define CMAKE_SOURCE_DIR "src/" - #define EXAMPLES_SOURCE_DIR "examples/" - #define CMAKE_EXT "" +#define OPENCL_TEST_DEVICE -1 +#define CMAKE_SOURCE_DIR "src/" +#define EXAMPLES_SOURCE_DIR "examples/" +#define CMAKE_EXT "" #endif int main(int argc, char** argv); @@ -28,48 +28,48 @@ int main(int argc, char** argv); namespace caffe { template -class MultiDeviceTest : public ::testing::Test { - public: - typedef typename TypeParam::Dtype Dtype; - protected: - MultiDeviceTest() { - Caffe::set_mode(TypeParam::device); - } - virtual ~MultiDeviceTest() {} +class MultiDeviceTest: public ::testing::Test { + public: + typedef typename TypeParam::Dtype Dtype; + protected: + MultiDeviceTest() { + Caffe::set_mode(TypeParam::device); + } + virtual ~MultiDeviceTest() { + } }; typedef ::testing::Types TestDtypes; template struct CPUDevice { - typedef TypeParam Dtype; - static const Caffe::Brew device = Caffe::CPU; + typedef TypeParam Dtype; + static const Caffe::Brew device = Caffe::CPU; }; template -class CPUDeviceTest : public MultiDeviceTest > { +class CPUDeviceTest: public MultiDeviceTest > { }; #ifdef CPU_ONLY typedef ::testing::Types, - CPUDevice > TestDtypesAndDevices; +CPUDevice > TestDtypesAndDevices; #else template struct GPUDevice { - typedef TypeParam Dtype; - static const Caffe::Brew device = Caffe::GPU; + typedef TypeParam Dtype; + static const Caffe::Brew device = Caffe::GPU; }; template -class GPUDeviceTest : public MultiDeviceTest > { +class GPUDeviceTest: public MultiDeviceTest > { }; -typedef ::testing::Types, CPUDevice, - GPUDevice, GPUDevice > - TestDtypesAndDevices; +typedef ::testing::Types, CPUDevice, GPUDevice, + GPUDevice > TestDtypesAndDevices; #endif diff --git a/include/caffe/test/test_gradient_check_util.hpp b/include/caffe/test/test_gradient_check_util.hpp index cc5dcbad..081ce203 100644 --- a/include/caffe/test/test_gradient_check_util.hpp +++ b/include/caffe/test/test_gradient_check_util.hpp @@ -17,56 +17,57 @@ namespace caffe { // top blobs, and checks the gradient. template class GradientChecker { - public: - // kink and kink_range specify an ignored nonsmooth region of the form - // kink - kink_range <= |feature value| <= kink + kink_range, - // which accounts for all nonsmoothness in use by caffe - GradientChecker(const Dtype stepsize, const Dtype threshold, - const unsigned int seed = 1701, const Dtype kink = 0., - const Dtype kink_range = -1) - : stepsize_(stepsize), threshold_(threshold), seed_(seed), - kink_(kink), kink_range_(kink_range) {} - // Checks the gradient of a layer, with provided bottom layers and top - // layers. - // Note that after the gradient check, we do not guarantee that the data - // stored in the layer parameters and the blobs are unchanged. 
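// Aside: a minimal usage sketch of GradientChecker for an element-wise layer
// (illustration only; the exact include paths in this tree may differ):
#include <vector>
#include "caffe/blob.hpp"
#include "caffe/neuron_layers.hpp"
#include "caffe/test/test_gradient_check_util.hpp"

template <typename Dtype>
void check_relu_gradient(const std::vector<caffe::Blob<Dtype>*>& bottom,
                         const std::vector<caffe::Blob<Dtype>*>& top) {
  caffe::LayerParameter layer_param;
  caffe::ReLULayer<Dtype> layer(layer_param);
  // stepsize, threshold, seed, kink, kink_range; the kink window skips the
  // non-differentiable point at x == 0.
  caffe::GradientChecker<Dtype> checker(1e-2, 1e-3, 1701, 0., 0.01);
  checker.CheckGradientEltwise(&layer, bottom, top);
}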
- void CheckGradient(Layer* layer, const vector*>& bottom, - const vector*>& top, int check_bottom = -1) { + public: + // kink and kink_range specify an ignored nonsmooth region of the form + // kink - kink_range <= |feature value| <= kink + kink_range, + // which accounts for all nonsmoothness in use by caffe + GradientChecker(const Dtype stepsize, const Dtype threshold, + const unsigned int seed = 1701, const Dtype kink = 0., + const Dtype kink_range = -1) + : stepsize_(stepsize), threshold_(threshold), seed_(seed), kink_(kink), kink_range_( + kink_range) { + } + // Checks the gradient of a layer, with provided bottom layers and top + // layers. + // Note that after the gradient check, we do not guarantee that the data + // stored in the layer parameters and the blobs are unchanged. + void CheckGradient(Layer* layer, const vector*>& bottom, + const vector*>& top, int check_bottom = -1) { layer->SetUp(bottom, top); CheckGradientSingle(layer, bottom, top, check_bottom, -1, -1); - } - void CheckGradientExhaustive(Layer* layer, - const vector*>& bottom, const vector*>& top, - int check_bottom = -1); + } + void CheckGradientExhaustive(Layer* layer, + const vector*>& bottom, const vector*>& top, + int check_bottom = -1); - // CheckGradientEltwise can be used to test layers that perform element-wise - // computation only (e.g., neuron layers) -- where (d y_i) / (d x_j) = 0 when - // i != j. - void CheckGradientEltwise(Layer* layer, - const vector*>& bottom, const vector*>& top); + // CheckGradientEltwise can be used to test layers that perform element-wise + // computation only (e.g., neuron layers) -- where (d y_i) / (d x_j) = 0 when + // i != j. + void CheckGradientEltwise(Layer* layer, + const vector*>& bottom, const vector*>& top); - void CheckGradientSingle(Layer* layer, - const vector*>& bottom, const vector*>& top, - int check_bottom, int top_id, int top_data_id, bool element_wise = false); + void CheckGradientSingle(Layer* layer, + const vector*>& bottom, const vector*>& top, + int check_bottom, int top_id, int top_data_id, + bool element_wise = false); - // Checks the gradient of a network. This network should not have any data - // layers or loss layers, since the function does not explicitly deal with - // such cases yet. All input blobs and parameter blobs are going to be - // checked, layer-by-layer to avoid numerical problems to accumulate. - void CheckGradientNet(const Net& net, - const vector*>& input); + // Checks the gradient of a network. This network should not have any data + // layers or loss layers, since the function does not explicitly deal with + // such cases yet. All input blobs and parameter blobs are going to be + // checked, layer-by-layer to avoid numerical problems to accumulate. 
+ void CheckGradientNet(const Net& net, + const vector*>& input); - protected: - Dtype GetObjAndGradient(const Layer& layer, - const vector*>& top, int top_id = -1, int top_data_id = -1); - Dtype stepsize_; - Dtype threshold_; - unsigned int seed_; - Dtype kink_; - Dtype kink_range_; + protected: + Dtype GetObjAndGradient(const Layer& layer, + const vector*>& top, int top_id = -1, int top_data_id = -1); + Dtype stepsize_; + Dtype threshold_; + unsigned int seed_; + Dtype kink_; + Dtype kink_range_; }; - template void GradientChecker::CheckGradientSingle(Layer* layer, const vector*>& bottom, const vector*>& top, @@ -107,8 +108,8 @@ void GradientChecker::CheckGradientSingle(Layer* layer, GetObjAndGradient(*layer, top, top_id, top_data_id); layer->Backward(top, propagate_down, bottom); // Store computed gradients for all checked blobs - vector > > - computed_gradient_blobs(blobs_to_check.size()); + vector < shared_ptr > + > computed_gradient_blobs(blobs_to_check.size()); for (int blob_id = 0; blob_id < blobs_to_check.size(); ++blob_id) { Blob* current_blob = blobs_to_check[blob_id]; computed_gradient_blobs[blob_id].reset(new Blob()); @@ -143,18 +144,18 @@ void GradientChecker::CheckGradientSingle(Layer* layer, current_blob->mutable_cpu_data()[feat_id] += stepsize_; Caffe::set_random_seed(seed_); layer->Forward(bottom, top); - positive_objective = - GetObjAndGradient(*layer, top, top_id, top_data_id); + positive_objective = GetObjAndGradient(*layer, top, top_id, + top_data_id); // Compute loss with stepsize_ subtracted from input. current_blob->mutable_cpu_data()[feat_id] -= stepsize_ * 2; Caffe::set_random_seed(seed_); layer->Forward(bottom, top); - negative_objective = - GetObjAndGradient(*layer, top, top_id, top_data_id); + negative_objective = GetObjAndGradient(*layer, top, top_id, + top_data_id); // Recover original input value. 
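// Note: the two perturbed forward passes above form a central difference,
// estimated_gradient ~ (f(x + h) - f(x - h)) / (2h) with h == stepsize_.
// Re-seeding the RNG before each pass keeps any stochastic layer
// (e.g. dropout) on the same mask, so only the perturbed feature differs
// between the two evaluations.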
current_blob->mutable_cpu_data()[feat_id] += stepsize_; - estimated_gradient = (positive_objective - negative_objective) / - stepsize_ / 2.; + estimated_gradient = (positive_objective - negative_objective) + / stepsize_ / 2.; } Dtype computed_gradient = computed_gradients[feat_id]; Dtype feature = current_blob->cpu_data()[feat_id]; @@ -167,11 +168,10 @@ void GradientChecker::CheckGradientSingle(Layer* layer, Dtype scale = std::max( std::max(fabs(computed_gradient), fabs(estimated_gradient)), 1.); EXPECT_NEAR(computed_gradient, estimated_gradient, threshold_ * scale) - << "debug: (top_id, top_data_id, blob_id, feat_id)=" - << top_id << "," << top_data_id << "," << blob_id << "," << feat_id - << "; feat = " << feature - << "; objective+ = " << positive_objective - << "; objective- = " << negative_objective; + << "debug: (top_id, top_data_id, blob_id, feat_id)=" << top_id + << "," << top_data_id << "," << blob_id << "," << feat_id + << "; feat = " << feature << "; objective+ = " << positive_objective + << "; objective- = " << negative_objective; } // LOG(ERROR) << "Feature: " << current_blob->cpu_data()[feat_id]; // LOG(ERROR) << "computed gradient: " << computed_gradient @@ -211,11 +211,11 @@ void GradientChecker::CheckGradientEltwise(Layer* layer, } template -void GradientChecker::CheckGradientNet( - const Net& net, const vector*>& input) { +void GradientChecker::CheckGradientNet(const Net& net, + const vector*>& input) { const vector > >& layers = net.layers(); - vector*> >& bottom_vecs = net.bottom_vecs(); - vector*> >& top_vecs = net.top_vecs(); + vector < vector*> > &bottom_vecs = net.bottom_vecs(); + vector < vector*> > &top_vecs = net.top_vecs(); for (int i = 0; i < layers.size(); ++i) { net.Forward(input); LOG(ERROR) << "Checking gradient for " << layers[i]->layer_param().name(); diff --git a/include/caffe/util/benchmark.hpp b/include/caffe/util/benchmark.hpp index d6358277..f48be453 100644 --- a/include/caffe/util/benchmark.hpp +++ b/include/caffe/util/benchmark.hpp @@ -8,43 +8,50 @@ namespace caffe { class Timer { - public: - Timer(); - virtual ~Timer(); - virtual void Start(); - virtual void Stop(); - virtual float MilliSeconds(); - virtual float MicroSeconds(); - virtual float Seconds(); - - inline bool initted() { return initted_; } - inline bool running() { return running_; } - inline bool has_run_at_least_once() { return has_run_at_least_once_; } - - protected: - void Init(); - - bool initted_; - bool running_; - bool has_run_at_least_once_; + public: + Timer(); + virtual ~Timer(); + virtual void Start(); + virtual void Stop(); + virtual float MilliSeconds(); + virtual float MicroSeconds(); + virtual float Seconds(); + + inline bool initted() { + return initted_; + } + inline bool running() { + return running_; + } + inline bool has_run_at_least_once() { + return has_run_at_least_once_; + } + + protected: + void Init(); + + bool initted_; + bool running_; + bool has_run_at_least_once_; #ifndef CPU_ONLY - cudaEvent_t start_gpu_; - cudaEvent_t stop_gpu_; + //cudaEvent_t start_gpu_; + //cudaEvent_t stop_gpu_; #endif - boost::posix_time::ptime start_cpu_; - boost::posix_time::ptime stop_cpu_; - float elapsed_milliseconds_; - float elapsed_microseconds_; + boost::posix_time::ptime start_cpu_; + boost::posix_time::ptime stop_cpu_; + float elapsed_milliseconds_; + float elapsed_microseconds_; }; -class CPUTimer : public Timer { - public: - explicit CPUTimer(); - virtual ~CPUTimer() {} - virtual void Start(); - virtual void Stop(); - virtual float MilliSeconds(); - virtual float 
MicroSeconds(); +class CPUTimer: public Timer { + public: + explicit CPUTimer(); + virtual ~CPUTimer() { + } + virtual void Start(); + virtual void Stop(); + virtual float MilliSeconds(); + virtual float MicroSeconds(); }; } // namespace caffe diff --git a/include/caffe/util/cudnn.hpp b/include/caffe/util/cudnn.hpp index b531dd5f..1994c48a 100644 --- a/include/caffe/util/cudnn.hpp +++ b/include/caffe/util/cudnn.hpp @@ -17,114 +17,114 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) { switch (status) { case CUDNN_STATUS_SUCCESS: - return "CUDNN_STATUS_SUCCESS"; + return "CUDNN_STATUS_SUCCESS"; case CUDNN_STATUS_NOT_INITIALIZED: - return "CUDNN_STATUS_NOT_INITIALIZED"; + return "CUDNN_STATUS_NOT_INITIALIZED"; case CUDNN_STATUS_ALLOC_FAILED: - return "CUDNN_STATUS_ALLOC_FAILED"; + return "CUDNN_STATUS_ALLOC_FAILED"; case CUDNN_STATUS_BAD_PARAM: - return "CUDNN_STATUS_BAD_PARAM"; + return "CUDNN_STATUS_BAD_PARAM"; case CUDNN_STATUS_INTERNAL_ERROR: - return "CUDNN_STATUS_INTERNAL_ERROR"; + return "CUDNN_STATUS_INTERNAL_ERROR"; case CUDNN_STATUS_INVALID_VALUE: - return "CUDNN_STATUS_INVALID_VALUE"; + return "CUDNN_STATUS_INVALID_VALUE"; case CUDNN_STATUS_ARCH_MISMATCH: - return "CUDNN_STATUS_ARCH_MISMATCH"; + return "CUDNN_STATUS_ARCH_MISMATCH"; case CUDNN_STATUS_MAPPING_ERROR: - return "CUDNN_STATUS_MAPPING_ERROR"; + return "CUDNN_STATUS_MAPPING_ERROR"; case CUDNN_STATUS_EXECUTION_FAILED: - return "CUDNN_STATUS_EXECUTION_FAILED"; + return "CUDNN_STATUS_EXECUTION_FAILED"; case CUDNN_STATUS_NOT_SUPPORTED: - return "CUDNN_STATUS_NOT_SUPPORTED"; + return "CUDNN_STATUS_NOT_SUPPORTED"; case CUDNN_STATUS_LICENSE_ERROR: - return "CUDNN_STATUS_LICENSE_ERROR"; + return "CUDNN_STATUS_LICENSE_ERROR"; } return "Unknown cudnn status"; } namespace caffe { -namespace cudnn { - -template class dataType; -template<> class dataType { - public: - static const cudnnDataType_t type = CUDNN_DATA_FLOAT; - static float oneval, zeroval; - static const void *one, *zero; -}; -template<> class dataType { - public: - static const cudnnDataType_t type = CUDNN_DATA_DOUBLE; - static double oneval, zeroval; - static const void *one, *zero; -}; - -template -inline void createTensor4dDesc(cudnnTensorDescriptor_t* desc) { - CUDNN_CHECK(cudnnCreateTensorDescriptor(desc)); -} - -template -inline void setTensor4dDesc(cudnnTensorDescriptor_t* desc, - int n, int c, int h, int w, - int stride_n, int stride_c, int stride_h, int stride_w) { - CUDNN_CHECK(cudnnSetTensor4dDescriptorEx(*desc, dataType::type, - n, c, h, w, stride_n, stride_c, stride_h, stride_w)); -} - -template -inline void setTensor4dDesc(cudnnTensorDescriptor_t* desc, - int n, int c, int h, int w) { - const int stride_w = 1; - const int stride_h = w * stride_w; - const int stride_c = h * stride_h; - const int stride_n = c * stride_c; - setTensor4dDesc(desc, n, c, h, w, - stride_n, stride_c, stride_h, stride_w); -} - -template -inline void createFilterDesc(cudnnFilterDescriptor_t* desc, - int n, int c, int h, int w) { - CUDNN_CHECK(cudnnCreateFilterDescriptor(desc)); - CUDNN_CHECK(cudnnSetFilter4dDescriptor(*desc, dataType::type, - n, c, h, w)); -} - -template -inline void createConvolutionDesc(cudnnConvolutionDescriptor_t* conv) { - CUDNN_CHECK(cudnnCreateConvolutionDescriptor(conv)); -} - -template -inline void setConvolutionDesc(cudnnConvolutionDescriptor_t* conv, - cudnnTensorDescriptor_t bottom, cudnnFilterDescriptor_t filter, - int pad_h, int pad_w, int stride_h, int stride_w) { - CUDNN_CHECK(cudnnSetConvolution2dDescriptor(*conv, - pad_h, pad_w, 
stride_h, stride_w, 1, 1, CUDNN_CROSS_CORRELATION)); -} - -template -inline void createPoolingDesc(cudnnPoolingDescriptor_t* pool_desc, - PoolingParameter_PoolMethod poolmethod, cudnnPoolingMode_t* mode, - int h, int w, int pad_h, int pad_w, int stride_h, int stride_w) { - switch (poolmethod) { - case PoolingParameter_PoolMethod_MAX: - *mode = CUDNN_POOLING_MAX; - break; - case PoolingParameter_PoolMethod_AVE: - *mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; - break; - default: - LOG(FATAL) << "Unknown pooling method."; - } - CUDNN_CHECK(cudnnCreatePoolingDescriptor(pool_desc)); - CUDNN_CHECK(cudnnSetPooling2dDescriptor(*pool_desc, *mode, h, w, - pad_h, pad_w, stride_h, stride_w)); -} - -} // namespace cudnn + namespace cudnn { + + template class dataType; + template<> class dataType { + public: + static const cudnnDataType_t type = CUDNN_DATA_FLOAT; + static float oneval, zeroval; + static const void *one, *zero; + }; + template<> class dataType { + public: + static const cudnnDataType_t type = CUDNN_DATA_DOUBLE; + static double oneval, zeroval; + static const void *one, *zero; + }; + + template + inline void createTensor4dDesc(cudnnTensorDescriptor_t* desc) { + CUDNN_CHECK(cudnnCreateTensorDescriptor(desc)); + } + + template + inline void setTensor4dDesc(cudnnTensorDescriptor_t* desc, + int n, int c, int h, int w, + int stride_n, int stride_c, int stride_h, int stride_w) { + CUDNN_CHECK(cudnnSetTensor4dDescriptorEx(*desc, dataType::type, + n, c, h, w, stride_n, stride_c, stride_h, stride_w)); + } + + template + inline void setTensor4dDesc(cudnnTensorDescriptor_t* desc, + int n, int c, int h, int w) { + const int stride_w = 1; + const int stride_h = w * stride_w; + const int stride_c = h * stride_h; + const int stride_n = c * stride_c; + setTensor4dDesc(desc, n, c, h, w, + stride_n, stride_c, stride_h, stride_w); + } + + template + inline void createFilterDesc(cudnnFilterDescriptor_t* desc, + int n, int c, int h, int w) { + CUDNN_CHECK(cudnnCreateFilterDescriptor(desc)); + CUDNN_CHECK(cudnnSetFilter4dDescriptor(*desc, dataType::type, + n, c, h, w)); + } + + template + inline void createConvolutionDesc(cudnnConvolutionDescriptor_t* conv) { + CUDNN_CHECK(cudnnCreateConvolutionDescriptor(conv)); + } + + template + inline void setConvolutionDesc(cudnnConvolutionDescriptor_t* conv, + cudnnTensorDescriptor_t bottom, cudnnFilterDescriptor_t filter, + int pad_h, int pad_w, int stride_h, int stride_w) { + CUDNN_CHECK(cudnnSetConvolution2dDescriptor(*conv, + pad_h, pad_w, stride_h, stride_w, 1, 1, CUDNN_CROSS_CORRELATION)); + } + + template + inline void createPoolingDesc(cudnnPoolingDescriptor_t* pool_desc, + PoolingParameter_PoolMethod poolmethod, cudnnPoolingMode_t* mode, + int h, int w, int pad_h, int pad_w, int stride_h, int stride_w) { + switch (poolmethod) { + case PoolingParameter_PoolMethod_MAX: + *mode = CUDNN_POOLING_MAX; + break; + case PoolingParameter_PoolMethod_AVE: + *mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; + break; + default: + LOG(FATAL) << "Unknown pooling method."; + } + CUDNN_CHECK(cudnnCreatePoolingDescriptor(pool_desc)); + CUDNN_CHECK(cudnnSetPooling2dDescriptor(*pool_desc, *mode, h, w, + pad_h, pad_w, stride_h, stride_w)); + } + + } // namespace cudnn } // namespace caffe diff --git a/include/caffe/util/db.hpp b/include/caffe/util/db.hpp index 59ec3d39..a872fb07 100644 --- a/include/caffe/util/db.hpp +++ b/include/caffe/util/db.hpp @@ -6,43 +6,52 @@ #include "caffe/common.hpp" #include "caffe/proto/caffe.pb.h" -namespace caffe { namespace db { +namespace 
caffe { +namespace db { -enum Mode { READ, WRITE, NEW }; +enum Mode { + READ, WRITE, NEW +}; class Cursor { - public: - Cursor() { } - virtual ~Cursor() { } - virtual void SeekToFirst() = 0; - virtual void Next() = 0; - virtual string key() = 0; - virtual string value() = 0; - virtual bool valid() = 0; - - DISABLE_COPY_AND_ASSIGN(Cursor); + public: + Cursor() { + } + virtual ~Cursor() { + } + virtual void SeekToFirst() = 0; + virtual void Next() = 0; + virtual string key() = 0; + virtual string value() = 0; + virtual bool valid() = 0; + + DISABLE_COPY_AND_ASSIGN (Cursor); }; class Transaction { - public: - Transaction() { } - virtual ~Transaction() { } - virtual void Put(const string& key, const string& value) = 0; - virtual void Commit() = 0; - - DISABLE_COPY_AND_ASSIGN(Transaction); + public: + Transaction() { + } + virtual ~Transaction() { + } + virtual void Put(const string& key, const string& value) = 0; + virtual void Commit() = 0; + + DISABLE_COPY_AND_ASSIGN (Transaction); }; class DB { - public: - DB() { } - virtual ~DB() { } - virtual void Open(const string& source, Mode mode) = 0; - virtual void Close() = 0; - virtual Cursor* NewCursor() = 0; - virtual Transaction* NewTransaction() = 0; - - DISABLE_COPY_AND_ASSIGN(DB); + public: + DB() { + } + virtual ~DB() { + } + virtual void Open(const string& source, Mode mode) = 0; + virtual void Close() = 0; + virtual Cursor* NewCursor() = 0; + virtual Transaction* NewTransaction() = 0; + + DISABLE_COPY_AND_ASSIGN (DB); }; DB* GetDB(DataParameter::DB backend); diff --git a/include/caffe/util/db_leveldb.hpp b/include/caffe/util/db_leveldb.hpp index 10623554..c0f6ab62 100644 --- a/include/caffe/util/db_leveldb.hpp +++ b/include/caffe/util/db_leveldb.hpp @@ -8,65 +8,86 @@ #include "caffe/util/db.hpp" -namespace caffe { namespace db { +namespace caffe { +namespace db { -class LevelDBCursor : public Cursor { - public: - explicit LevelDBCursor(leveldb::Iterator* iter) - : iter_(iter) { SeekToFirst(); } - ~LevelDBCursor() { delete iter_; } - virtual void SeekToFirst() { iter_->SeekToFirst(); } - virtual void Next() { iter_->Next(); } - virtual string key() { return iter_->key().ToString(); } - virtual string value() { return iter_->value().ToString(); } - virtual bool valid() { return iter_->Valid(); } +class LevelDBCursor: public Cursor { + public: + explicit LevelDBCursor(leveldb::Iterator* iter) + : iter_(iter) { + SeekToFirst(); + } + ~LevelDBCursor() { + delete iter_; + } + virtual void SeekToFirst() { + iter_->SeekToFirst(); + } + virtual void Next() { + iter_->Next(); + } + virtual string key() { + return iter_->key().ToString(); + } + virtual string value() { + return iter_->value().ToString(); + } + virtual bool valid() { + return iter_->Valid(); + } - private: - leveldb::Iterator* iter_; + private: + leveldb::Iterator* iter_; }; -class LevelDBTransaction : public Transaction { - public: - explicit LevelDBTransaction(leveldb::DB* db) : db_(db) { CHECK_NOTNULL(db_); } - virtual void Put(const string& key, const string& value) { - batch_.Put(key, value); - } - virtual void Commit() { - leveldb::Status status = db_->Write(leveldb::WriteOptions(), &batch_); - CHECK(status.ok()) << "Failed to write batch to leveldb " - << std::endl << status.ToString(); - } +class LevelDBTransaction: public Transaction { + public: + explicit LevelDBTransaction(leveldb::DB* db) + : db_(db) { + CHECK_NOTNULL(db_); + } + virtual void Put(const string& key, const string& value) { + batch_.Put(key, value); + } + virtual void Commit() { + leveldb::Status status = 
db_->Write(leveldb::WriteOptions(), &batch_); + CHECK(status.ok()) << "Failed to write batch to leveldb " << std::endl + << status.ToString(); + } - private: - leveldb::DB* db_; - leveldb::WriteBatch batch_; + private: + leveldb::DB* db_; + leveldb::WriteBatch batch_; - DISABLE_COPY_AND_ASSIGN(LevelDBTransaction); + DISABLE_COPY_AND_ASSIGN (LevelDBTransaction); }; -class LevelDB : public DB { - public: - LevelDB() : db_(NULL) { } - virtual ~LevelDB() { Close(); } - virtual void Open(const string& source, Mode mode); - virtual void Close() { - if (db_ != NULL) { - delete db_; - db_ = NULL; +class LevelDB: public DB { + public: + LevelDB() + : db_(NULL) { + } + virtual ~LevelDB() { + Close(); + } + virtual void Open(const string& source, Mode mode); + virtual void Close() { + if (db_ != NULL) { + delete db_; + db_ = NULL; + } + } + virtual LevelDBCursor* NewCursor() { + return new LevelDBCursor(db_->NewIterator(leveldb::ReadOptions())); + } + virtual LevelDBTransaction* NewTransaction() { + return new LevelDBTransaction(db_); } - } - virtual LevelDBCursor* NewCursor() { - return new LevelDBCursor(db_->NewIterator(leveldb::ReadOptions())); - } - virtual LevelDBTransaction* NewTransaction() { - return new LevelDBTransaction(db_); - } - private: - leveldb::DB* db_; + private: + leveldb::DB* db_; }; - } // namespace db } // namespace caffe diff --git a/include/caffe/util/db_lmdb.hpp b/include/caffe/util/db_lmdb.hpp index cc7c90af..232b439a 100644 --- a/include/caffe/util/db_lmdb.hpp +++ b/include/caffe/util/db_lmdb.hpp @@ -7,82 +7,97 @@ #include "caffe/util/db.hpp" -namespace caffe { namespace db { +namespace caffe { +namespace db { inline void MDB_CHECK(int mdb_status) { CHECK_EQ(mdb_status, MDB_SUCCESS) << mdb_strerror(mdb_status); } -class LMDBCursor : public Cursor { - public: - explicit LMDBCursor(MDB_txn* mdb_txn, MDB_cursor* mdb_cursor) - : mdb_txn_(mdb_txn), mdb_cursor_(mdb_cursor), valid_(false) { - SeekToFirst(); - } - virtual ~LMDBCursor() { - mdb_cursor_close(mdb_cursor_); - mdb_txn_abort(mdb_txn_); - } - virtual void SeekToFirst() { Seek(MDB_FIRST); } - virtual void Next() { Seek(MDB_NEXT); } - virtual string key() { - return string(static_cast(mdb_key_.mv_data), mdb_key_.mv_size); - } - virtual string value() { - return string(static_cast(mdb_value_.mv_data), - mdb_value_.mv_size); - } - virtual bool valid() { return valid_; } +class LMDBCursor: public Cursor { + public: + explicit LMDBCursor(MDB_txn* mdb_txn, MDB_cursor* mdb_cursor) + : mdb_txn_(mdb_txn), mdb_cursor_(mdb_cursor), valid_(false) { + SeekToFirst(); + } + virtual ~LMDBCursor() { + mdb_cursor_close(mdb_cursor_); + mdb_txn_abort(mdb_txn_); + } + virtual void SeekToFirst() { + Seek (MDB_FIRST); + } + virtual void Next() { + Seek (MDB_NEXT); + } + virtual string key() { + return string(static_cast(mdb_key_.mv_data), + mdb_key_.mv_size); + } + virtual string value() { + return string(static_cast(mdb_value_.mv_data), + mdb_value_.mv_size); + } + virtual bool valid() { + return valid_; + } - private: - void Seek(MDB_cursor_op op) { - int mdb_status = mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_, op); - if (mdb_status == MDB_NOTFOUND) { - valid_ = false; - } else { - MDB_CHECK(mdb_status); - valid_ = true; + private: + void Seek(MDB_cursor_op op) { + int mdb_status = mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_, op); + if (mdb_status == MDB_NOTFOUND) { + valid_ = false; + } else { + MDB_CHECK(mdb_status); + valid_ = true; + } } - } - MDB_txn* mdb_txn_; - MDB_cursor* mdb_cursor_; - MDB_val mdb_key_, mdb_value_; - 
bool valid_; + MDB_txn* mdb_txn_; + MDB_cursor* mdb_cursor_; + MDB_val mdb_key_, mdb_value_; + bool valid_; }; -class LMDBTransaction : public Transaction { - public: - explicit LMDBTransaction(MDB_dbi* mdb_dbi, MDB_txn* mdb_txn) - : mdb_dbi_(mdb_dbi), mdb_txn_(mdb_txn) { } - virtual void Put(const string& key, const string& value); - virtual void Commit() { MDB_CHECK(mdb_txn_commit(mdb_txn_)); } +class LMDBTransaction: public Transaction { + public: + explicit LMDBTransaction(MDB_dbi* mdb_dbi, MDB_txn* mdb_txn) + : mdb_dbi_(mdb_dbi), mdb_txn_(mdb_txn) { + } + virtual void Put(const string& key, const string& value); + virtual void Commit() { + MDB_CHECK(mdb_txn_commit(mdb_txn_)); + } - private: - MDB_dbi* mdb_dbi_; - MDB_txn* mdb_txn_; + private: + MDB_dbi* mdb_dbi_; + MDB_txn* mdb_txn_; - DISABLE_COPY_AND_ASSIGN(LMDBTransaction); + DISABLE_COPY_AND_ASSIGN (LMDBTransaction); }; -class LMDB : public DB { - public: - LMDB() : mdb_env_(NULL) { } - virtual ~LMDB() { Close(); } - virtual void Open(const string& source, Mode mode); - virtual void Close() { - if (mdb_env_ != NULL) { - mdb_dbi_close(mdb_env_, mdb_dbi_); - mdb_env_close(mdb_env_); - mdb_env_ = NULL; +class LMDB: public DB { + public: + LMDB() + : mdb_env_(NULL) { + } + virtual ~LMDB() { + Close(); + } + virtual void Open(const string& source, Mode mode); + virtual void Close() { + if (mdb_env_ != NULL) { + mdb_dbi_close(mdb_env_, mdb_dbi_); + mdb_env_close(mdb_env_); + mdb_env_ = NULL; + } } - } - virtual LMDBCursor* NewCursor(); - virtual LMDBTransaction* NewTransaction(); + virtual LMDBCursor* NewCursor(); + virtual LMDBTransaction* NewTransaction(); - private: - MDB_env* mdb_env_; - MDB_dbi mdb_dbi_; + private: + MDB_env* mdb_env_; + MDB_dbi mdb_dbi_; }; } // namespace db diff --git a/include/caffe/util/device_alternate.hpp b/include/caffe/util/device_alternate.hpp index 6ea595db..bf5d7705 100644 --- a/include/caffe/util/device_alternate.hpp +++ b/include/caffe/util/device_alternate.hpp @@ -31,70 +31,11 @@ void classname::funcname##_##gpu(const vector*>& top, \ #else // Normal GPU + CPU Caffe. -#include -#include -#include -#include -#include // cuda driver types #ifdef USE_CUDNN // cuDNN acceleration library. #include "caffe/util/cudnn.hpp" #endif -// -// CUDA macros -// - -// CUDA: various checks for different function calls. -#define CUDA_CHECK(condition) \ - /* Code block avoids redefinition of cudaError_t error */ \ - do { \ - cudaError_t error = condition; \ - CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \ - } while (0) - -#define CUBLAS_CHECK(condition) \ - do { \ - cublasStatus_t status = condition; \ - CHECK_EQ(status, CUBLAS_STATUS_SUCCESS) << " " \ - << caffe::cublasGetErrorString(status); \ - } while (0) - -#define CURAND_CHECK(condition) \ - do { \ - curandStatus_t status = condition; \ - CHECK_EQ(status, CURAND_STATUS_SUCCESS) << " " \ - << caffe::curandGetErrorString(status); \ - } while (0) - -// CUDA: grid stride looping -#define CUDA_KERNEL_LOOP(i, n) \ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ - i < (n); \ - i += blockDim.x * gridDim.x) - -// CUDA: check for error after kernel execution and exit loudly if there is one. -#define CUDA_POST_KERNEL_CHECK CUDA_CHECK(cudaPeekAtLastError()) - namespace caffe { - -// CUDA: library error reporting. -const char* cublasGetErrorString(cublasStatus_t error); -const char* curandGetErrorString(curandStatus_t error); - -// CUDA: thread number configuration. 
-// Use 1024 threads per block, which requires cuda sm_2x or above, -// or fall back to attempt compatibility (best of luck to you). -#if __CUDA_ARCH__ >= 200 - const int CAFFE_CUDA_NUM_THREADS = 1024; -#else - const int CAFFE_CUDA_NUM_THREADS = 512; -#endif - -// CUDA: number of blocks for threads. -inline int CAFFE_GET_BLOCKS(const int N) { - return (N + CAFFE_CUDA_NUM_THREADS - 1) / CAFFE_CUDA_NUM_THREADS; -} - } // namespace caffe #endif // CPU_ONLY diff --git a/include/caffe/util/im2col.hpp b/include/caffe/util/im2col.hpp index 0051e2fa..9c6de363 100644 --- a/include/caffe/util/im2col.hpp +++ b/include/caffe/util/im2col.hpp @@ -1,32 +1,74 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ **************************************************************************************/ + #ifndef _CAFFE_UTIL_IM2COL_HPP_ #define _CAFFE_UTIL_IM2COL_HPP_ namespace caffe { template -void im2col_cpu(const Dtype* data_im, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, Dtype* data_col); +void im2col_cpu(const Dtype* data_im, const int channels, const int height, + const int width, const int kernel_h, const int kernel_w, const int pad_h, + const int pad_w, const int stride_h, const int stride_w, Dtype* data_col); + +template +void col2im_cpu(const Dtype* data_col, const int channels, const int height, + const int width, const int patch_h, const int patch_w, const int pad_h, + const int pad_w, const int stride_h, const int stride_w, Dtype* data_im); +#ifndef CPU_ONLY template -void col2im_cpu(const Dtype* data_col, const int channels, - const int height, const int width, const int patch_h, const int patch_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, Dtype* data_im); +void col2im_gpu(const Dtype* data_col, const int col_offset, const int height, + const int width, const int channels, const int patch_h, const int patch_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + Dtype* data_im, const int img_offset); template -void im2col_gpu(const Dtype* data_im, const int channels, +void im2col_gpu(const Dtype* data_im, const int img_offset, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, Dtype* data_col); + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + Dtype* data_col, const int col_offset); template -void col2im_gpu(const Dtype* data_col, const int channels, - const int height, const int width, const int patch_h, const int patch_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, Dtype* data_im); +void im2col_gpu_opt(const Dtype* data_im, const int img_offset, + const int channels, const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, Dtype* data_col, const int col_offset, + int optnum); +template +void col2im_gpu(cl_kernel Kernel, const Dtype* data_col, const int col_offset, + const int channels, const int height, const int width, const int psize, + const int pad, const int stride, Dtype* data_im, const int img_offset); + +template +void col2im_gpu_opt(const Dtype* data_col, const int col_offset, + const int channels, const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, Dtype* data_im, const int img_offset, + int optnum); +#endif } // namespace caffe #endif // CAFFE_UTIL_IM2COL_HPP_ diff --git a/include/caffe/util/io.hpp b/include/caffe/util/io.hpp index 3a62c3c9..c04cce6a 100644 --- a/include/caffe/util/io.hpp +++ b/include/caffe/util/io.hpp @@ -38,8 +38,8 @@ inline void MakeTempDir(string* temp_dirname) { // NOLINT_NEXT_LINE(runtime/printf) strcpy(temp_dirname_cstr, temp_dirname->c_str()); char* mkdtemp_result = mkdtemp(temp_dirname_cstr); - CHECK(mkdtemp_result != NULL) - << "Failed to create a temporary directory at: " << *temp_dirname; + CHECK(mkdtemp_result != NULL) << "Failed to create a temporary directory at: " + << 
*temp_dirname; *temp_dirname = temp_dirname_cstr; delete[] temp_dirname_cstr; } @@ -74,14 +74,13 @@ inline void ReadProtoFromBinaryFileOrDie(const char* filename, Message* proto) { } inline void ReadProtoFromBinaryFileOrDie(const string& filename, - Message* proto) { + Message* proto) { ReadProtoFromBinaryFileOrDie(filename.c_str(), proto); } - void WriteProtoToBinaryFile(const Message& proto, const char* filename); -inline void WriteProtoToBinaryFile( - const Message& proto, const string& filename) { +inline void WriteProtoToBinaryFile(const Message& proto, + const string& filename) { WriteProtoToBinaryFile(proto, filename.c_str()); } @@ -91,14 +90,13 @@ inline bool ReadFileToDatum(const string& filename, Datum* datum) { return ReadFileToDatum(filename, -1, datum); } -bool ReadImageToDatum(const string& filename, const int label, - const int height, const int width, const bool is_color, - const std::string & encoding, Datum* datum); +bool ReadImageToDatum(const string& filename, const int label, const int height, + const int width, const bool is_color, const std::string & encoding, + Datum* datum); inline bool ReadImageToDatum(const string& filename, const int label, const int height, const int width, const bool is_color, Datum* datum) { - return ReadImageToDatum(filename, label, height, width, is_color, - "", datum); + return ReadImageToDatum(filename, label, height, width, is_color, "", datum); } inline bool ReadImageToDatum(const string& filename, const int label, @@ -124,14 +122,13 @@ inline bool ReadImageToDatum(const string& filename, const int label, bool DecodeDatumNative(Datum* datum); bool DecodeDatum(Datum* datum, bool is_color); -cv::Mat ReadImageToCVMat(const string& filename, - const int height, const int width, const bool is_color); +cv::Mat ReadImageToCVMat(const string& filename, const int height, + const int width, const bool is_color); -cv::Mat ReadImageToCVMat(const string& filename, - const int height, const int width); +cv::Mat ReadImageToCVMat(const string& filename, const int height, + const int width); -cv::Mat ReadImageToCVMat(const string& filename, - const bool is_color); +cv::Mat ReadImageToCVMat(const string& filename, const bool is_color); cv::Mat ReadImageToCVMat(const string& filename); @@ -141,18 +138,16 @@ cv::Mat DecodeDatumToCVMat(const Datum& datum, bool is_color); void CVMatToDatum(const cv::Mat& cv_img, Datum* datum); template -void hdf5_load_nd_dataset_helper( - hid_t file_id, const char* dataset_name_, int min_dim, int max_dim, - Blob* blob); +void hdf5_load_nd_dataset_helper(hid_t file_id, const char* dataset_name_, + int min_dim, int max_dim, Blob* blob); template -void hdf5_load_nd_dataset( - hid_t file_id, const char* dataset_name_, int min_dim, int max_dim, - Blob* blob); +void hdf5_load_nd_dataset(hid_t file_id, const char* dataset_name_, int min_dim, + int max_dim, Blob* blob); template -void hdf5_save_nd_dataset( - const hid_t file_id, const string& dataset_name, const Blob& blob); +void hdf5_save_nd_dataset(const hid_t file_id, const string& dataset_name, + const Blob& blob); } // namespace caffe diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp index 2cacd8e7..4ca1fac0 100644 --- a/include/caffe/util/math_functions.hpp +++ b/include/caffe/util/math_functions.hpp @@ -1,24 +1,75 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + #ifndef CAFFE_UTIL_MATH_FUNCTIONS_H_ #define CAFFE_UTIL_MATH_FUNCTIONS_H_ #include #include // for std::fabs and std::signbit - +#include +#include #include "glog/logging.h" -#include "caffe/common.hpp" -#include "caffe/util/device_alternate.hpp" #include "caffe/util/mkl_alternate.hpp" +#include "caffe/util/ocl_util.hpp" namespace caffe { -// Caffe gemm provides a simpler interface to the gemm functions, with the +// Decaf gemm provides a simpler interface to the gemm functions, with the // limitation that the data has to be contiguous in memory. template -void caffe_cpu_gemm(const CBLAS_TRANSPOSE TransA, +void caffe_cpu_gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const Dtype alpha, const Dtype* A, + const Dtype* B, const Dtype beta, Dtype* C); + +// Decaf gpu gemm provides an interface that is almost the same as the cpu +// gemm function - following the c convention and calling the fortran-order +// gpu code under the hood. 
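Both the CPU wrapper above and the OpenCL gemm overloads declared next keep the row-major C convention described in the comment: C = alpha * op(A) * op(B) + beta * C, with op(A) of size MxK, op(B) of size KxN, and C of size MxN. A minimal usage sketch against the CPU entry point (the GPU overloads additionally take buffer offsets for the clBLAS path); this assumes a build that links the Caffe math functions and their BLAS backend.

#include <vector>
#include "caffe/util/math_functions.hpp"

int main() {
  const int M = 2, N = 2, K = 3;
  // A is M x K, B is K x N, C is M x N, all row-major and contiguous in memory.
  std::vector<float> A = {1, 2, 3,
                          4, 5, 6};
  std::vector<float> B = {1, 0,
                          0, 1,
                          1, 1};
  std::vector<float> C(M * N, 0.f);
  // C = 1.0 * A * B + 0.0 * C  ->  {{4, 5}, {10, 11}}
  caffe::caffe_cpu_gemm<float>(CblasNoTrans, CblasNoTrans, M, N, K,
                               1.f, A.data(), B.data(), 0.f, C.data());
  return 0;
}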
+template +void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const Dtype alpha, const Dtype* A, + const Dtype* B, const Dtype beta, Dtype* C); + +template +cl_event caffe_gpu_gemm(cl_command_queue *queue, const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const Dtype alpha, const Dtype* A, const int offA, const Dtype* B, + const int offB, const Dtype beta, Dtype* C, const int offC); +/*This is Yuan Gao's sgemm_ex*/ +template +void caffe_gpu_exgemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, - Dtype* C); + Dtype* C, const int offset1, const int offset2, const int offset3); + +template +cl_event caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const Dtype alpha, const Dtype* A, const int offA, const Dtype* B, + const int offB, const Dtype beta, Dtype* C, const int offC); template void caffe_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, @@ -26,29 +77,75 @@ void caffe_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, Dtype* y); template -void caffe_axpy(const int N, const Dtype alpha, const Dtype* X, - Dtype* Y); +void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const Dtype alpha, const Dtype* A, size_t offA, int lda, const Dtype * x, + size_t offx, const Dtype beta, int incx, Dtype* y, size_t offy, int incy); + +template +void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, + Dtype* y); + +template +void caffe_axpy(const int N, const Dtype alpha, const Dtype* X, Dtype* Y); + +template +void caffe_gpu_axpy(const int N, const Dtype alpha, const Dtype* X, Dtype* Y); template void caffe_cpu_axpby(const int N, const Dtype alpha, const Dtype* X, const Dtype beta, Dtype* Y); +template +void caffe_gpu_axpby(const int N, const Dtype alpha, const Dtype* X, + const Dtype beta, Dtype* Y); + template void caffe_copy(const int N, const Dtype *X, Dtype *Y); template void caffe_set(const int N, const Dtype alpha, Dtype *X); +template +void caffe_gpu_set(const int N, const Dtype alpha, Dtype *X, const int offset=0); + inline void caffe_memset(const size_t N, const int alpha, void* X) { memset(X, alpha, N); // NOLINT(caffe/alt_fn) } +inline void caffe_gpu_memset(const size_t N, const int alpha, void* X) { +#ifndef CPU_ONLY + ocl_memset((int*) X, (alpha << 24) | (alpha << 16) | (alpha << 8) | alpha, N); +#endif +} + +void caffe_gpu_memcpy(const size_t N, const void *X, void *Y); + +template +void caffe_gpu_memcpy(const size_t N, const Dtype *X, Dtype *Y); + +template +void caffe_gpu_copy(const int N, const Dtype *X, Dtype *Y); + +template +void caffe_gpu_copy(const int N, const Dtype* X, const int offx, Dtype* Y, const int offy); + template void caffe_add_scalar(const int N, const Dtype alpha, Dtype *X); +template +void caffe_gpu_add_scalar(const int N, const Dtype alpha, Dtype *X); + +template +void caffe_add_scalar(cl_kernel kernel, const int N, const Dtype alpha, + Dtype *X); + template void caffe_scal(const int N, const Dtype alpha, Dtype *X); +template +void caffe_gpu_scal(const int N, const Dtype alpha, Dtype *X, const int offx = 0); + template void caffe_sqr(const int N, const Dtype* a, Dtype* y); @@ -61,12 +158,27 @@ void caffe_sub(const 
int N, const Dtype* a, const Dtype* b, Dtype* y); template void caffe_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y); +template +void caffe_gpu_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y); + template void caffe_div(const int N, const Dtype* a, const Dtype* b, Dtype* y); +//CUDA version, need to be deleted +template +void caffe_gpu_div(const int N, const Dtype* a, const Dtype* b, Dtype* y); + +template +void caffe_gpu_div(cl_kernel kernel, const int N, const Dtype* a, + const Dtype* b, Dtype* y); + template void caffe_powx(const int n, const Dtype* a, const Dtype b, Dtype* y); +//CUDA version, need to be deleted +template +void caffe_gpu_powx(const int n, const Dtype* a, const Dtype b, Dtype* y); + unsigned int caffe_rng_rand(); template @@ -75,9 +187,25 @@ Dtype caffe_nextafter(const Dtype b); template void caffe_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r); +// caffe_gpu_rng_uniform with two arguments generates integers in the range +// [0, UINT_MAX]. +void caffe_gpu_rng_uniform(const int n, unsigned int* r); + +// caffe_gpu_rng_uniform with four arguments generates floats in the range +// (a, b] (strictly greater than a, less than or equal to b) due to the +// specification of curandGenerateUniform. With a = 0, b = 1, just calls +// curandGenerateUniform; with other limits will shift and scale the outputs +// appropriately after calling curandGenerateUniform. +template +void caffe_gpu_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r); + template void caffe_rng_gaussian(const int n, const Dtype mu, const Dtype sigma, - Dtype* r); + Dtype* r); + +template +void caffe_gpu_rng_gaussian(const int n, const Dtype mu, const Dtype sigma, + Dtype* r); template void caffe_rng_bernoulli(const int n, const Dtype p, int* r); @@ -86,32 +214,41 @@ template void caffe_rng_bernoulli(const int n, const Dtype p, unsigned int* r); template -void caffe_exp(const int n, const Dtype* a, Dtype* y); +void caffe_gpu_rng_bernoulli(const int n, const Dtype p, int* r); template -void caffe_log(const int n, const Dtype* a, Dtype* y); +void caffe_exp(const int n, const Dtype* a, Dtype* y); template -void caffe_abs(const int n, const Dtype* a, Dtype* y); +Dtype caffe_cpu_dot(const int n, const Dtype* x, const Dtype* y); template -Dtype caffe_cpu_dot(const int n, const Dtype* x, const Dtype* y); +void caffe_gpu_dot(const int n, const Dtype* x, const Dtype* y, Dtype* out); template -Dtype caffe_cpu_strided_dot(const int n, const Dtype* x, const int incx, - const Dtype* y, const int incy); +void caffe_gpu_dot(const int n, const Dtype* x, size_t offx, const Dtype* y, size_t offy, Dtype* out); template int caffe_cpu_hamming_distance(const int n, const Dtype* x, const Dtype* y); +template +uint32_t caffe_gpu_hamming_distance(const int n, const Dtype* x, + const Dtype* y); + // Returns the sum of the absolute values of the elements of vector x template Dtype caffe_cpu_asum(const int n, const Dtype* x); +template +void caffe_gpu_asum(const int n, const Dtype* x, Dtype* y); + +template +void caffe_gpu_asum(const int n, const Dtype* x, size_t offx, Dtype* y); + // the branchless, type-safe version from // http://stackoverflow.com/questions/1903954/is-there-a-standard-sign-function-signum-sgn-in-c-c -template -inline int8_t caffe_sign(Dtype val) { +template +inline char caffe_sign(Dtype val) { return (Dtype(0) < val) - (val < Dtype(0)); } @@ -130,63 +267,54 @@ inline int8_t caffe_sign(Dtype val) { } \ } -// output is 1 for the positives, 0 for zero, and -1 for the 
negatives -DEFINE_CAFFE_CPU_UNARY_FUNC(sign, y[i] = caffe_sign(x[i])); - -// This returns a nonzero value if the input has its sign bit set. -// The name sngbit is meant to avoid conflicts with std::signbit in the macro. -// The extra parens are needed because CUDA < 6.5 defines signbit as a macro, -// and we don't want that to expand here when CUDA headers are also included. -DEFINE_CAFFE_CPU_UNARY_FUNC(sgnbit, \ - y[i] = static_cast((std::signbit)(x[i]))); - -DEFINE_CAFFE_CPU_UNARY_FUNC(fabs, y[i] = std::fabs(x[i])); +#define INSTANTIATE_CAFFE_CPU_UNARY_FUNC(name) \ + template <> \ + void caffe_cpu_##name(const int n, const float* x, float* y); \ + template <> \ + void caffe_cpu_##name(const int n, const double* x, double* y) -template -void caffe_cpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y); +#define DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(name, operation) \ +template \ +void name##_kernel(const int n, const Dtype* x, Dtype* y) { \ + operation; \ +} \ +template <> \ +void caffe_gpu_##name(const int n, const float* x, float* y) { \ +} \ +template <> \ +void caffe_gpu_##name(const int n, const double* x, double* y) { \ +} -#ifndef CPU_ONLY // GPU +// output is 1 for the positives, 0 for zero, and -1 for the negatives +DEFINE_CAFFE_CPU_UNARY_FUNC(sign, y[i] = caffe_sign(x[i])); -// Decaf gpu gemm provides an interface that is almost the same as the cpu -// gemm function - following the c convention and calling the fortran-order -// gpu code under the hood. template -void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, - const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, - Dtype* C); +void caffe_gpu_sign(const int n, const Dtype* x, Dtype* y); template -void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, - const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, - Dtype* y); +void caffe_gpu_sign(const int N, const Dtype *X, const int offx, Dtype *Y, const int offy); -template -void caffe_gpu_axpy(const int N, const Dtype alpha, const Dtype* X, - Dtype* Y); +// This returns a nonzero value if the input has its sign bit set. 
+// The name sngbit is meant to avoid conflicts with std::signbit in the macro +using std::signbit; +DEFINE_CAFFE_CPU_UNARY_FUNC(sgnbit, y[i] = signbit(x[i])); template -void caffe_gpu_axpby(const int N, const Dtype alpha, const Dtype* X, - const Dtype beta, Dtype* Y); +void caffe_gpu_sgnbit(const int n, const Dtype* x, Dtype* y); -void caffe_gpu_memcpy(const size_t N, const void *X, void *Y); +DEFINE_CAFFE_CPU_UNARY_FUNC(fabs, y[i] = std::fabs(x[i])); template -void caffe_gpu_set(const int N, const Dtype alpha, Dtype *X); +void caffe_gpu_fabs(const int n, const Dtype* x, Dtype* y); -inline void caffe_gpu_memset(const size_t N, const int alpha, void* X) { -#ifndef CPU_ONLY - CUDA_CHECK(cudaMemset(X, alpha, N)); // NOLINT(caffe/alt_fn) -#else - NO_GPU; -#endif -} +template +void caffe_cpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y); template -void caffe_gpu_add_scalar(const int N, const Dtype alpha, Dtype *X); +void caffe_gpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y); template -void caffe_gpu_scal(const int N, const Dtype alpha, Dtype *X); +void caffe_gpu_scale(const int n, const Dtype alpha, const Dtype *x, const int offx, Dtype* y, const int offy); template void caffe_gpu_add(const int N, const Dtype* a, const Dtype* b, Dtype* y); @@ -212,69 +340,18 @@ void caffe_gpu_log(const int n, const Dtype* a, Dtype* y); template void caffe_gpu_powx(const int n, const Dtype* a, const Dtype b, Dtype* y); -// caffe_gpu_rng_uniform with two arguments generates integers in the range -// [0, UINT_MAX]. -void caffe_gpu_rng_uniform(const int n, unsigned int* r); - -// caffe_gpu_rng_uniform with four arguments generates floats in the range -// (a, b] (strictly greater than a, less than or equal to b) due to the -// specification of curandGenerateUniform. With a = 0, b = 1, just calls -// curandGenerateUniform; with other limits will shift and scale the outputs -// appropriately after calling curandGenerateUniform. 
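The comment removed here documents the convention the uniform RNG wrappers follow: a base generator yielding u in (0, 1] is mapped onto (a, b] by r = a + u * (b - a). A host-side sketch of that shift-and-scale; std::mt19937 is used purely as a stand-in for the device generator and is not part of the patch.

#include <cassert>
#include <random>
#include <vector>

// Map uniform samples from (0, 1] onto (a, b], mirroring the documented
// shift-and-scale applied after the base uniform generator.
static void rng_uniform(const int n, const float a, const float b, float* r,
                        std::mt19937* gen) {
  std::uniform_real_distribution<float> dist(0.f, 1.f);  // stand-in base generator
  for (int i = 0; i < n; ++i) {
    const float u = 1.f - dist(*gen);  // flip [0, 1) into (0, 1]
    r[i] = a + u * (b - a);
  }
}

int main() {
  std::mt19937 gen(1701);
  std::vector<float> r(1000);
  rng_uniform(static_cast<int>(r.size()), -2.f, 3.f, r.data(), &gen);
  for (float v : r) assert(v > -2.f && v <= 3.f);  // strictly greater than a, at most b
  return 0;
}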
-template -void caffe_gpu_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r); - template -void caffe_gpu_rng_gaussian(const int n, const Dtype mu, const Dtype sigma, - Dtype* r); - -template -void caffe_gpu_rng_bernoulli(const int n, const Dtype p, int* r); - -template -void caffe_gpu_dot(const int n, const Dtype* x, const Dtype* y, Dtype* out); - -template -uint32_t caffe_gpu_hamming_distance(const int n, const Dtype* x, - const Dtype* y); +void caffe_exp(const int n, const Dtype* a, Dtype* y); template -void caffe_gpu_asum(const int n, const Dtype* x, Dtype* y); - -template -void caffe_gpu_sign(const int n, const Dtype* x, Dtype* y); - -template -void caffe_gpu_sgnbit(const int n, const Dtype* x, Dtype* y); +void caffe_abs(const int n, const Dtype* a, Dtype* y); template -void caffe_gpu_fabs(const int n, const Dtype* x, Dtype* y); +void caffe_log(const int n, const Dtype* a, Dtype* y); template -void caffe_gpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y); - -#define DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(name, operation) \ -template \ -__global__ void name##_kernel(const int n, const Dtype* x, Dtype* y) { \ - CUDA_KERNEL_LOOP(index, n) { \ - operation; \ - } \ -} \ -template <> \ -void caffe_gpu_##name(const int n, const float* x, float* y) { \ - /* NOLINT_NEXT_LINE(whitespace/operators) */ \ - name##_kernel<<>>( \ - n, x, y); \ -} \ -template <> \ -void caffe_gpu_##name(const int n, const double* x, double* y) { \ - /* NOLINT_NEXT_LINE(whitespace/operators) */ \ - name##_kernel<<>>( \ - n, x, y); \ -} - -#endif // !CPU_ONLY - +Dtype caffe_cpu_strided_dot(const int n, const Dtype* x, const int incx, + const Dtype* y, const int incy); } // namespace caffe #endif // CAFFE_UTIL_MATH_FUNCTIONS_H_ diff --git a/include/caffe/util/math_functions.hpp.protect b/include/caffe/util/math_functions.hpp.protect new file mode 100644 index 00000000..2cacd8e7 --- /dev/null +++ b/include/caffe/util/math_functions.hpp.protect @@ -0,0 +1,280 @@ +#ifndef CAFFE_UTIL_MATH_FUNCTIONS_H_ +#define CAFFE_UTIL_MATH_FUNCTIONS_H_ + +#include +#include // for std::fabs and std::signbit + +#include "glog/logging.h" + +#include "caffe/common.hpp" +#include "caffe/util/device_alternate.hpp" +#include "caffe/util/mkl_alternate.hpp" + +namespace caffe { + +// Caffe gemm provides a simpler interface to the gemm functions, with the +// limitation that the data has to be contiguous in memory. 
+template +void caffe_cpu_gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, + Dtype* C); + +template +void caffe_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, + Dtype* y); + +template +void caffe_axpy(const int N, const Dtype alpha, const Dtype* X, + Dtype* Y); + +template +void caffe_cpu_axpby(const int N, const Dtype alpha, const Dtype* X, + const Dtype beta, Dtype* Y); + +template +void caffe_copy(const int N, const Dtype *X, Dtype *Y); + +template +void caffe_set(const int N, const Dtype alpha, Dtype *X); + +inline void caffe_memset(const size_t N, const int alpha, void* X) { + memset(X, alpha, N); // NOLINT(caffe/alt_fn) +} + +template +void caffe_add_scalar(const int N, const Dtype alpha, Dtype *X); + +template +void caffe_scal(const int N, const Dtype alpha, Dtype *X); + +template +void caffe_sqr(const int N, const Dtype* a, Dtype* y); + +template +void caffe_add(const int N, const Dtype* a, const Dtype* b, Dtype* y); + +template +void caffe_sub(const int N, const Dtype* a, const Dtype* b, Dtype* y); + +template +void caffe_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y); + +template +void caffe_div(const int N, const Dtype* a, const Dtype* b, Dtype* y); + +template +void caffe_powx(const int n, const Dtype* a, const Dtype b, Dtype* y); + +unsigned int caffe_rng_rand(); + +template +Dtype caffe_nextafter(const Dtype b); + +template +void caffe_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r); + +template +void caffe_rng_gaussian(const int n, const Dtype mu, const Dtype sigma, + Dtype* r); + +template +void caffe_rng_bernoulli(const int n, const Dtype p, int* r); + +template +void caffe_rng_bernoulli(const int n, const Dtype p, unsigned int* r); + +template +void caffe_exp(const int n, const Dtype* a, Dtype* y); + +template +void caffe_log(const int n, const Dtype* a, Dtype* y); + +template +void caffe_abs(const int n, const Dtype* a, Dtype* y); + +template +Dtype caffe_cpu_dot(const int n, const Dtype* x, const Dtype* y); + +template +Dtype caffe_cpu_strided_dot(const int n, const Dtype* x, const int incx, + const Dtype* y, const int incy); + +template +int caffe_cpu_hamming_distance(const int n, const Dtype* x, const Dtype* y); + +// Returns the sum of the absolute values of the elements of vector x +template +Dtype caffe_cpu_asum(const int n, const Dtype* x); + +// the branchless, type-safe version from +// http://stackoverflow.com/questions/1903954/is-there-a-standard-sign-function-signum-sgn-in-c-c +template +inline int8_t caffe_sign(Dtype val) { + return (Dtype(0) < val) - (val < Dtype(0)); +} + +// The following two macros are modifications of DEFINE_VSL_UNARY_FUNC +// in include/caffe/util/mkl_alternate.hpp authored by @Rowland Depp. +// Please refer to commit 7e8ef25c7 of the boost-eigen branch. +// Git cherry picking that commit caused a conflict hard to resolve and +// copying that file in convenient for code reviewing. +// So they have to be pasted here temporarily. 
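The DEFINE_CAFFE_CPU_UNARY_FUNC macro that follows stamps out element-wise CPU helpers; for example, DEFINE_CAFFE_CPU_UNARY_FUNC(sign, y[i] = caffe_sign(x[i])) yields caffe_cpu_sign, which maps positives to 1, zero to 0, and negatives to -1 via the branchless caffe_sign. Below is a hand-expanded sketch of that instantiation and its use, written standalone for clarity (the glog CHECK_GT/CHECK argument checks from the real macro are omitted here).

#include <cassert>
#include <vector>

// Branchless sign, as declared in the header: (0 < v) - (v < 0).
template <typename Dtype>
inline char caffe_sign(Dtype val) {
  return (Dtype(0) < val) - (val < Dtype(0));
}

// Hand-expanded equivalent of DEFINE_CAFFE_CPU_UNARY_FUNC(sign, y[i] = caffe_sign(x[i])),
// with the glog checks dropped to keep the sketch self-contained.
template <typename Dtype>
void caffe_cpu_sign(const int n, const Dtype* x, Dtype* y) {
  for (int i = 0; i < n; ++i) {
    y[i] = caffe_sign(x[i]);
  }
}

int main() {
  std::vector<double> x = {-3.5, 0.0, 2.25};
  std::vector<double> y(x.size());
  caffe_cpu_sign(static_cast<int>(x.size()), x.data(), y.data());
  assert(y[0] == -1.0 && y[1] == 0.0 && y[2] == 1.0);
  return 0;
}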
+#define DEFINE_CAFFE_CPU_UNARY_FUNC(name, operation) \ + template \ + void caffe_cpu_##name(const int n, const Dtype* x, Dtype* y) { \ + CHECK_GT(n, 0); CHECK(x); CHECK(y); \ + for (int i = 0; i < n; ++i) { \ + operation; \ + } \ + } + +// output is 1 for the positives, 0 for zero, and -1 for the negatives +DEFINE_CAFFE_CPU_UNARY_FUNC(sign, y[i] = caffe_sign(x[i])); + +// This returns a nonzero value if the input has its sign bit set. +// The name sngbit is meant to avoid conflicts with std::signbit in the macro. +// The extra parens are needed because CUDA < 6.5 defines signbit as a macro, +// and we don't want that to expand here when CUDA headers are also included. +DEFINE_CAFFE_CPU_UNARY_FUNC(sgnbit, \ + y[i] = static_cast((std::signbit)(x[i]))); + +DEFINE_CAFFE_CPU_UNARY_FUNC(fabs, y[i] = std::fabs(x[i])); + +template +void caffe_cpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y); + +#ifndef CPU_ONLY // GPU + +// Decaf gpu gemm provides an interface that is almost the same as the cpu +// gemm function - following the c convention and calling the fortran-order +// gpu code under the hood. +template +void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, + Dtype* C); + +template +void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, + Dtype* y); + +template +void caffe_gpu_axpy(const int N, const Dtype alpha, const Dtype* X, + Dtype* Y); + +template +void caffe_gpu_axpby(const int N, const Dtype alpha, const Dtype* X, + const Dtype beta, Dtype* Y); + +void caffe_gpu_memcpy(const size_t N, const void *X, void *Y); + +template +void caffe_gpu_set(const int N, const Dtype alpha, Dtype *X); + +inline void caffe_gpu_memset(const size_t N, const int alpha, void* X) { +#ifndef CPU_ONLY + CUDA_CHECK(cudaMemset(X, alpha, N)); // NOLINT(caffe/alt_fn) +#else + NO_GPU; +#endif +} + +template +void caffe_gpu_add_scalar(const int N, const Dtype alpha, Dtype *X); + +template +void caffe_gpu_scal(const int N, const Dtype alpha, Dtype *X); + +template +void caffe_gpu_add(const int N, const Dtype* a, const Dtype* b, Dtype* y); + +template +void caffe_gpu_sub(const int N, const Dtype* a, const Dtype* b, Dtype* y); + +template +void caffe_gpu_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y); + +template +void caffe_gpu_div(const int N, const Dtype* a, const Dtype* b, Dtype* y); + +template +void caffe_gpu_abs(const int n, const Dtype* a, Dtype* y); + +template +void caffe_gpu_exp(const int n, const Dtype* a, Dtype* y); + +template +void caffe_gpu_log(const int n, const Dtype* a, Dtype* y); + +template +void caffe_gpu_powx(const int n, const Dtype* a, const Dtype b, Dtype* y); + +// caffe_gpu_rng_uniform with two arguments generates integers in the range +// [0, UINT_MAX]. +void caffe_gpu_rng_uniform(const int n, unsigned int* r); + +// caffe_gpu_rng_uniform with four arguments generates floats in the range +// (a, b] (strictly greater than a, less than or equal to b) due to the +// specification of curandGenerateUniform. With a = 0, b = 1, just calls +// curandGenerateUniform; with other limits will shift and scale the outputs +// appropriately after calling curandGenerateUniform. 
+template +void caffe_gpu_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r); + +template +void caffe_gpu_rng_gaussian(const int n, const Dtype mu, const Dtype sigma, + Dtype* r); + +template +void caffe_gpu_rng_bernoulli(const int n, const Dtype p, int* r); + +template +void caffe_gpu_dot(const int n, const Dtype* x, const Dtype* y, Dtype* out); + +template +uint32_t caffe_gpu_hamming_distance(const int n, const Dtype* x, + const Dtype* y); + +template +void caffe_gpu_asum(const int n, const Dtype* x, Dtype* y); + +template +void caffe_gpu_sign(const int n, const Dtype* x, Dtype* y); + +template +void caffe_gpu_sgnbit(const int n, const Dtype* x, Dtype* y); + +template +void caffe_gpu_fabs(const int n, const Dtype* x, Dtype* y); + +template +void caffe_gpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y); + +#define DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(name, operation) \ +template \ +__global__ void name##_kernel(const int n, const Dtype* x, Dtype* y) { \ + CUDA_KERNEL_LOOP(index, n) { \ + operation; \ + } \ +} \ +template <> \ +void caffe_gpu_##name(const int n, const float* x, float* y) { \ + /* NOLINT_NEXT_LINE(whitespace/operators) */ \ + name##_kernel<<>>( \ + n, x, y); \ +} \ +template <> \ +void caffe_gpu_##name(const int n, const double* x, double* y) { \ + /* NOLINT_NEXT_LINE(whitespace/operators) */ \ + name##_kernel<<>>( \ + n, x, y); \ +} + +#endif // !CPU_ONLY + +} // namespace caffe + +#endif // CAFFE_UTIL_MATH_FUNCTIONS_H_ diff --git a/include/caffe/util/mkl_alternate.hpp b/include/caffe/util/mkl_alternate.hpp index 3355b665..2ca24374 100644 --- a/include/caffe/util/mkl_alternate.hpp +++ b/include/caffe/util/mkl_alternate.hpp @@ -81,14 +81,12 @@ DEFINE_VSL_BINARY_FUNC(Div, y[i] = a[i] / b[i]); // in standard blas. We will simply use a two-step (inefficient, of course) way // to mimic that. inline void cblas_saxpby(const int N, const float alpha, const float* X, - const int incX, const float beta, float* Y, - const int incY) { + const int incX, const float beta, float* Y, const int incY) { cblas_sscal(N, beta, Y, incY); cblas_saxpy(N, alpha, X, incX, Y, incY); } inline void cblas_daxpby(const int N, const double alpha, const double* X, - const int incX, const double beta, double* Y, - const int incY) { + const int incX, const double beta, double* Y, const int incY) { cblas_dscal(N, beta, Y, incY); cblas_daxpy(N, alpha, X, incX, Y, incY); } diff --git a/include/caffe/util/ocl_util.hpp b/include/caffe/util/ocl_util.hpp new file mode 100644 index 00000000..3027019f --- /dev/null +++ b/include/caffe/util/ocl_util.hpp @@ -0,0 +1,41 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + +#ifndef _CAFFE_UTIL_OCL_UTIL_HPP_ +#define _CAFFE_UTIL_OCL_UTIL_HPP_ + +namespace caffe { +#ifndef CPU_ONLY +template +void ocl_memset(Dtype* buffer, const Dtype value, const int count, const int buf_offset = 0); + +void ocl_memset(cl_mem buffer, const int value, const int count); + +void eventCallback(cl_event event, cl_int event_status, void * user_data); +#endif +} // namespace caffe + +#endif // CAFFE_UTIL_OCL_UTIL_HPP_ diff --git a/include/caffe/util/ocl_wrapper.hpp b/include/caffe/util/ocl_wrapper.hpp new file mode 100644 index 00000000..0ce3a184 --- /dev/null +++ b/include/caffe/util/ocl_wrapper.hpp @@ -0,0 +1,358 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ **************************************************************************************/ + +#ifndef _CAFFE_UTIL_OCL_WRAPPER_HPP_ +#define _CAFFE_UTIL_OCL_WRAPPER_HPP_ + +namespace caffe { + +typedef unsigned int uint32_t; + +template inline std::string get_dtype_suffix() { + dtype x; + const char type = typeid(x).name()[0]; + std::string suffix; + switch (type) { + case 'i': + suffix = "_int"; + break; + case 'd': + suffix = "_double"; + break; + case 'f': + default: + suffix = "_float"; + } + return suffix; +} + +#ifndef CPU_ONLY +template +void transform_gpu(Dtype* src, Dtype* dst, const int top_offset, const int N_, + const int M_, const int packing_num); + +template +void opttrans(const Dtype* data_im, const int im_offset, const int channels, + const int height, const int width, Dtype* data_opt, const int opt_offset, + const int optnum); + +template +void get_max_gpu(cl_kernel Kernel, const int num, const int dim, + const Dtype* bottom_data, Dtype* scale_data); + +template +void exp_gpu(cl_kernel Kernel, const int num, const Dtype* data, Dtype* out); + +template +void softmax_div_gpu(cl_kernel Kernel, const int num, const int dim, + const Dtype* scale, Dtype* data); + +template +Dtype softmax_gpu(cl_kernel Kernel, const int num, const int dim, + const Dtype* prob_data, const Dtype* label, cl_mem d_loss); + +template +void scal_gpu(cl_kernel Kernel, const int num, const Dtype alpha, Dtype* data); + +template +void diff_gpu(cl_kernel Kernel, const int num, const int dim, Dtype* data, + const Dtype* label); + +template +void max_pool_fp_gpu(cl_kernel Kernel, const int count, + const Dtype* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + Dtype* top_data); + +template +void MaxPoolForward(const int count, const Dtype* bottom_data, const int clnum, + const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + const int pad_h_, const int pad_w_, Dtype* top_data, int* mask, + Dtype* top_mask); + +template +void MaxPoolBackward(const int nthreads, const Dtype* const top_diff, + const int* const mask, const Dtype* const top_mask, const int num, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, const int pad_h, + const int pad_w, Dtype* const bottom_diff); + +template +void AvePoolBackward(const int nthreads, const Dtype* const top_diff, + const int num, const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, const int pad_h, + const int pad_w, Dtype* const bottom_diff); + +template +void StoPoolBackward(const int nthreads, const Dtype* const rand_idx, + const Dtype* const top_diff, const int num, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, Dtype* const bottom_diff); +template +void SigmoidForward(const int count, const Dtype* bottom_data, Dtype* top_data); + +template +void SigmoidBackward(const int count, const Dtype* top_diff, + const Dtype* top_data, Dtype* bottom_diff); + +template +void TanHForward(const 
+
+template <typename Dtype>
+void TanHBackward(const int count, const Dtype* top_diff, const Dtype* top_data,
+    Dtype* bottom_diff);
+
+template <typename Dtype>
+void ThresholdForward(const int count, const Dtype threshold,
+    const Dtype* bottom_data, Dtype* top_data);
+
+template <typename Dtype>
+void ave_pool_fp_gpu(cl_kernel Kernel, const int count,
+    const Dtype* bottom_data, const int clnum, const int channels_,
+    const int height_, const int width_, const int pooled_height_,
+    const int pooled_width_, const int kernel_size_, const int stride_,
+    const int pad_, Dtype* top_data);
+
+template <typename Dtype>
+void AvePoolForward(const int count, const Dtype* bottom_data, const int clnum,
+    const int channels_, const int height_, const int width_,
+    const int pooled_height_, const int pooled_width_, const int kernel_h_,
+    const int kernel_w_, const int stride_h_, const int stride_w_,
+    const int pad_h_, const int pad_w_, Dtype* top_data);
+
+template <typename Dtype>
+void StoPoolForwardTrain(const int count, const Dtype* bottom_data,
+    const int clnum, const int channels_, const int height_, const int width_,
+    const int pooled_height_, const int pooled_width_, const int kernel_h_,
+    const int kernel_w_, const int stride_h_, const int stride_w_,
+    Dtype* idx_data, Dtype* top_data);
+
+template <typename Dtype>
+void StoPoolForwardTest(const int count, const Dtype* bottom_data,
+    const int clnum, const int channels_, const int height_, const int width_,
+    const int pooled_height_, const int pooled_width_, const int kernel_h_,
+    const int kernel_w_, const int stride_h_, const int stride_w_,
+    Dtype* top_data);
+
+template <typename Dtype>
+void max_pool_bp_gpu(cl_kernel Kernel, const int count,
+    const Dtype* bottom_data, const Dtype* top_data, const Dtype* top_diff,
+    const int clnum, const int channels_, const int height_, const int width_,
+    const int pooled_height_, const int pooled_width_, const int kernel_size_,
+    const int stride_, Dtype* bottom_diff);
+
+template <typename Dtype>
+void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff,
+    const int clnum, const int channels_, const int intheight_,
+    const int width_, const int pooled_height_, const int pooled_width_,
+    const int kernel_size_, const int stride_, const int pad_,
+    Dtype* bottom_diff);
+
+template <typename Dtype>
+void PReLUForward(const int count, const int channels, const int dim,
+    const Dtype* bottom_data, Dtype* top_data, const Dtype* slope_data,
+    const int div_factor);
+
+template <typename Dtype>
+void PReLUBackward(const int count, const int channels, const int dim,
+    const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff,
+    const Dtype* slope_data, const int div_factor);
+
+template <typename Dtype>
+void PReLUParamBackward(const int count, const Dtype* top_diff,
+    const int offset_out, const Dtype* bottom_data, const int offset_in,
+    Dtype* bottom_diff);
+
+template <typename Dtype>
+void ReLUForward(const int count, const Dtype* bottom_data, Dtype* top_data,
+    Dtype negative_slope);
+
+template <typename Dtype>
+void ReLUBackward(const int count, const Dtype* top_diff,
+    const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope);
+
+template <typename Dtype>
+void caffe_gpu_div(const int n, const Dtype* a, const Dtype* b, Dtype* y);
+
+template <typename Dtype>
+void DropoutForward(const int count, const Dtype* bottom_data,
+    const unsigned int* MaskMem, const unsigned int threshold,
+    const float scale_, Dtype *top_data);
+
+template <typename Dtype>
+void DropoutBackward(const int count, const Dtype* top_diff,
+    const unsigned int* MaskMem, const unsigned int threshold_,
+    const float scale_, Dtype* bottom_diff);
+
+template <typename Dtype>
+void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup,
+    Dtype threshold);
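A hedged sketch (not part of the patch) of how DropoutForward above would typically be driven: the mask is first filled with random unsigned integers (for example with caffe_gpu_uniform, declared just below), then DropoutForward keeps an activation only where its mask value exceeds the threshold and rescales by 1/(1 - ratio). Buffer and variable names here are assumptions:

    // UINT_MAX comes from <climits>.
    template <typename Dtype>
    void dropout_forward_sketch(const int count, const Dtype* bottom,
                                unsigned int* mask, float dropout_ratio,
                                Dtype* top) {
      const unsigned int uint_thres =
          static_cast<unsigned int>(UINT_MAX * dropout_ratio);
      const float scale = 1.f / (1.f - dropout_ratio);
      caffe_gpu_uniform(count, mask);  // fill mask with random uints on the device
      DropoutForward(count, bottom, mask, uint_thres, scale, top);
    }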
+
+void caffe_gpu_uniform(const unsigned int n, unsigned int *r,
+    unsigned int _seed = 0);
+
+template <typename Dtype>
+void caffe_gpu_uniform(Dtype* a, const unsigned int n, Dtype inf, Dtype sup,
+    unsigned int _seed = 0);
+
+template <typename Dtype>
+void caffe_gpu_gaussian(Dtype* a, const unsigned int n, Dtype E, Dtype V);
+
+template <typename Dtype>
+void caffe_gpu_abs_ocl(const int N, const Dtype* X, Dtype * Y);
+
+template <typename Dtype>
+void caffe_gpu_signbit(const int N, const Dtype* X, Dtype * Y);
+
+template <typename Dtype>
+void caffe_gpu_sign_ocl(const int N, const Dtype* X, Dtype * Y);
+
+template <typename Dtype>
+void caffe_gpu_sign_with_offset_ocl(const int N, const Dtype* X, const int offx,
+    Dtype * Y, const int offy);
+
+template <typename Dtype>
+void kernel_channel_max(const int num, const int channels,
+    const int spatial_dim, const Dtype* data, Dtype* out);
+
+template <typename Dtype>
+void kernel_channel_subtract(const int count, const int num, const int channels,
+    const int spatial_dim, const Dtype* channel_max, Dtype* data);
+
+template <typename Dtype>
+void kernel_powx(const int count, const Dtype* data, const Dtype alpha,
+    Dtype* out);
+
+template <typename Dtype>
+void kernel_div(const int count, const Dtype* a, const Dtype* b, Dtype* out);
+
+template <typename Dtype>
+void kernel_add(const int count, const Dtype* a, const Dtype* b, Dtype* out);
+
+template <typename Dtype>
+void kernel_mul(const int count, const Dtype* a, const Dtype* b, Dtype* out);
+
+template <typename Dtype>
+void kernel_log(const int count, const Dtype* data, Dtype* out);
+
+template <typename Dtype>
+void kernel_sub(const int count, const Dtype* a, const Dtype* b, Dtype* out);
+
+template <typename Dtype>
+void kernel_add_scalar(const int count, const Dtype data, Dtype* out);
+
+template <typename Dtype>
+void kernel_exp(const int count, const Dtype* data, Dtype* out);
+
+template <typename Dtype>
+void kernel_channel_sum(const int num, const int channels,
+    const int spatial_dim, const Dtype* data, Dtype* channel_sum);
+
+template <typename Dtype>
+void kernel_channel_div(const int count, const int num, const int channels,
+    const int spatial_dim, const Dtype* channel_sum, Dtype* data);
+
+template <typename Dtype>
+void kernel_channel_dot(const int num, const int channels,
+    const int spatial_dim, const Dtype* data_1, const Dtype* data_2,
+    Dtype* channel_dot);
+
+template <typename Dtype>
+void SoftmaxLossForwardGPU(const int nthreads, const Dtype* prob_data,
+    const Dtype* label, Dtype* loss, const int num, const int dim,
+    const int spatial_dim, const bool has_ignore_label_,
+    const int ignore_label_, Dtype* counts);
+
+template <typename Dtype>
+void SoftmaxLossBackwardGPU(const int nthreads, const Dtype* top,
+    const Dtype* label, Dtype* bottom_diff, const int num, const int dim,
+    const int spatial_dim, const bool has_ignore_label_,
+    const int ignore_label_, Dtype* counts);
+
+template <typename Dtype>
+void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y);
+
+template <typename Dtype>
+void caffe_gpu_add_scalar(const int n, const Dtype alpha, Dtype* top_data);
+
+template <typename Dtype>
+void LRNFillScale(const int nthreads, const Dtype* const in, const int num,
+    const int channels, const int height, const int width, const int size,
+    const Dtype alpha_over_size, const Dtype k, Dtype* const scale);
+
+template <typename Dtype>
+void LRNComputeOutput(int nthreads, const Dtype* in, Dtype* scale,
+    Dtype negative_beta, Dtype* out);
+
+template <typename Dtype>
+void LRNComputeDiff(const int nthreads, const Dtype* const bottom_data,
+    const Dtype* const top_data, const Dtype* const scale,
+    const Dtype* const top_diff, const int num, const int channels,
+    const int height, const int width, const int size,
+    const Dtype negative_beta, const Dtype cache_ratio,
+    Dtype* const bottom_diff);
+
+template <typename Dtype>
+void caffe_gpu_powx(const int n, const Dtype* a, const Dtype alpha, Dtype* y);
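The channel-wise helpers above (kernel_channel_max, kernel_channel_subtract, kernel_exp, kernel_channel_sum, kernel_channel_div) mirror the CUDA kernels that Caffe's SoftmaxLayer chains together. A sketch (not part of the patch; function and buffer names besides the declared kernels are assumptions) of the forward chain they support, given a buffer `top` that already holds a copy of the input and a `scale` buffer of num * spatial_dim elements:

    template <typename Dtype>
    void softmax_forward_sketch(const int num, const int channels,
                                const int spatial_dim, Dtype* top, Dtype* scale) {
      const int count = num * channels * spatial_dim;
      kernel_channel_max(num, channels, spatial_dim, top, scale);         // per-position max
      kernel_channel_subtract(count, num, channels, spatial_dim, scale, top);
      kernel_exp(count, top, top);                                        // exponentiate in place
      kernel_channel_sum(num, channels, spatial_dim, top, scale);         // per-position sum
      kernel_channel_div(count, num, channels, spatial_dim, scale, top);  // normalize
    }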
+
+template <typename Dtype>
+void caffe_gpu_mul(const int n, const Dtype* a, const Dtype* b, Dtype* y);
+
+template <typename Dtype>
+void BNLLForward(const int count, const Dtype* bottom_data, Dtype *top_data);
+
+template <typename Dtype>
+void BNLLBackward(const int count, const Dtype* top_diff,
+    const Dtype* bottom_data, Dtype *bottom_diff);
+
+template <typename Dtype>
+void Concat(const int nthreads, const Dtype* in_data, const bool forward,
+    const int num_concats, const int concat_size, const int top_concat_axis,
+    const int bottom_concat_axis, const int offset_concat_axis,
+    Dtype *out_data);
+
+template <typename Dtype>
+void CLLBackward(const int count, const int channels, const Dtype margin,
+    const bool legacy_version, const Dtype alpha, const Dtype* y,
+    const Dtype* diff, const Dtype* dist_sq, Dtype *bottom_diff);
+
+template <typename Dtype>
+void MaxForward(const int nthreads, const Dtype* bottom_data_a,
+    const Dtype* bottom_data_b, const int blob_idx, Dtype* top_data, int* mask);
+
+template <typename Dtype>
+void MaxBackward(const int nthreads, const Dtype* top_diff, const int blob_idx,
+    const int* mask, Dtype* bottom_diff);
+
+template <typename Dtype>
+void Slice(const int nthreads, const Dtype* in_data,
+    const bool forward, const int num_slices, const int slice_size,
+    const int bottom_slice_axis, const int top_slice_axis,
+    const int offset_slice_axis, Dtype* out_data);
+#endif
+}  // namespace caffe
+#endif  // _CAFFE_UTIL_OCL_WRAPPER_HPP_
diff --git a/include/caffe/util/rng.hpp b/include/caffe/util/rng.hpp
index 8f1cf0d1..febd932d 100644
--- a/include/caffe/util/rng.hpp
+++ b/include/caffe/util/rng.hpp
@@ -20,13 +20,13 @@ inline rng_t* caffe_rng() {
 // Fisher–Yates algorithm
 template <class RandomAccessIterator, class RandomGenerator>
 inline void shuffle(RandomAccessIterator begin, RandomAccessIterator end,
-                    RandomGenerator* gen) {
-  typedef typename std::iterator_traits<RandomAccessIterator>::difference_type
-      difference_type;
+    RandomGenerator* gen) {
+  typedef typename std::iterator_traits<RandomAccessIterator>::difference_type difference_type;
   typedef typename boost::uniform_int<difference_type> dist_type;
 
   difference_type length = std::distance(begin, end);
-  if (length <= 0) return;
+  if (length <= 0)
+    return;
 
   for (difference_type i = length - 1; i > 0; --i) {
     dist_type dist(0, i);
diff --git a/include/caffe/util/upgrade_proto.hpp b/include/caffe/util/upgrade_proto.hpp
index c1f21a0d..496ba1e0 100644
--- a/include/caffe/util/upgrade_proto.hpp
+++ b/include/caffe/util/upgrade_proto.hpp
@@ -23,11 +23,11 @@ bool UpgradeV0Net(const NetParameter& v0_net_param, NetParameter* net_param);
 // taking its top blob as input.
 // Error if any of these above layers are not-conv layers.
 void UpgradeV0PaddingLayers(const NetParameter& param,
-                            NetParameter* param_upgraded_pad);
+    NetParameter* param_upgraded_pad);
 
 // Upgrade a single V0LayerConnection to the V1LayerParameter format.
 bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
-                             V1LayerParameter* layer_param);
+    V1LayerParameter* layer_param);
 
 V1LayerParameter_LayerType UpgradeV0LayerType(const string& type);
 
@@ -46,7 +46,7 @@ bool NetNeedsV1ToV2Upgrade(const NetParameter& net_param);
 bool UpgradeV1Net(const NetParameter& v1_net_param, NetParameter* net_param);
 
 bool UpgradeV1LayerParameter(const V1LayerParameter& v1_layer_param,
-                             LayerParameter* layer_param);
+    LayerParameter* layer_param);
 
 const char* UpgradeV1LayerType(const V1LayerParameter_LayerType type);
 
@@ -55,9 +55,9 @@ bool UpgradeNetAsNeeded(const string& param_file, NetParameter* param);
 
 // Read parameters from a file into a NetParameter proto message.
void ReadNetParamsFromTextFileOrDie(const string& param_file, - NetParameter* param); + NetParameter* param); void ReadNetParamsFromBinaryFileOrDie(const string& param_file, - NetParameter* param); + NetParameter* param); } // namespace caffe diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp index a6bd86a9..381b983b 100644 --- a/include/caffe/vision_layers.hpp +++ b/include/caffe/vision_layers.hpp @@ -21,94 +21,149 @@ namespace caffe { * ConvolutionLayer and DeconvolutionLayer. */ template -class BaseConvolutionLayer : public Layer { - public: - explicit BaseConvolutionLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline int MinBottomBlobs() const { return 1; } - virtual inline int MinTopBlobs() const { return 1; } - virtual inline bool EqualNumBottomTopBlobs() const { return true; } - - protected: - // Helper functions that abstract away the column buffer and gemm arguments. - // The last argument in forward_cpu_gemm is so that we can skip the im2col if - // we just called weight_cpu_gemm with the same input. - void forward_cpu_gemm(const Dtype* input, const Dtype* weights, - Dtype* output, bool skip_im2col = false); - void forward_cpu_bias(Dtype* output, const Dtype* bias); - void backward_cpu_gemm(const Dtype* input, const Dtype* weights, - Dtype* output); - void weight_cpu_gemm(const Dtype* input, const Dtype* output, Dtype* - weights); - void backward_cpu_bias(Dtype* bias, const Dtype* input); +class BaseConvolutionLayer: public Layer { + public: + explicit BaseConvolutionLayer(const LayerParameter& param) + : Layer(param) { + } + virtual ~BaseConvolutionLayer(); + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline int MinBottomBlobs() const { + return 1; + } + virtual inline int MinTopBlobs() const { + return 1; + } + virtual inline bool EqualNumBottomTopBlobs() const { + return true; + } + + protected: + // Helper functions that abstract away the column buffer and gemm arguments. + // The last argument in forward_cpu_gemm is so that we can skip the im2col if + // we just called weight_cpu_gemm with the same input. 
+ void forward_cpu_gemm(const Dtype* input, const Dtype* weights, + Dtype* output, bool skip_im2col = false); + void forward_cpu_bias(Dtype* output, const Dtype* bias); + void backward_cpu_gemm(const Dtype* input, const Dtype* weights, + Dtype* output); + void weight_cpu_gemm(const Dtype* input, const Dtype* output, + Dtype* weights); + void backward_cpu_bias(Dtype* bias, const Dtype* input); + //opencl related setup + void ocl_setup(); #ifndef CPU_ONLY - void forward_gpu_gemm(const Dtype* col_input, const Dtype* weights, - Dtype* output, bool skip_im2col = false); - void forward_gpu_bias(Dtype* output, const Dtype* bias); - void backward_gpu_gemm(const Dtype* input, const Dtype* weights, - Dtype* col_output); - void weight_gpu_gemm(const Dtype* col_input, const Dtype* output, Dtype* - weights); - void backward_gpu_bias(Dtype* bias, const Dtype* input); + void forward_gpu_gemm(const Dtype* col_input, const Dtype* weights, + Dtype* output, bool skip_im2col = false); + void forward_gpu_gemm_opt(const Dtype* col_input, const Dtype* weights, + Dtype* output, bool skip_im2col = false); + void forward_gpu_bias(Dtype* output, const Dtype* bias); + void forward_gpu_bias_opt(Dtype* output, const Dtype* bias); + void backward_gpu_gemm(const Dtype* input, const Dtype* weights, + Dtype* col_output); + void backward_gpu_gemm_opt(const Dtype* input, const Dtype* weights, + Dtype* col_output); + void weight_gpu_gemm(const Dtype* col_input, const Dtype* output, + Dtype* weights); + void weight_gpu_gemm_opt(const Dtype* col_input, const Dtype* output, + Dtype* weights); + void backward_gpu_bias(Dtype* bias, const Dtype* input); #endif - // reverse_dimensions should return true iff we are implementing deconv, so - // that conv helpers know which dimensions are which. - virtual bool reverse_dimensions() = 0; - // Compute height_out_ and width_out_ from other parameters. - virtual void compute_output_shape() = 0; - - int kernel_h_, kernel_w_; - int stride_h_, stride_w_; - int num_; - int channels_; - int pad_h_, pad_w_; - int height_, width_; - int group_; - int num_output_; - int height_out_, width_out_; - bool bias_term_; - bool is_1x1_; - - private: - // wrap im2col/col2im so we don't have to remember the (long) argument lists - inline void conv_im2col_cpu(const Dtype* data, Dtype* col_buff) { - im2col_cpu(data, conv_in_channels_, conv_in_height_, conv_in_width_, - kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff); - } - inline void conv_col2im_cpu(const Dtype* col_buff, Dtype* data) { - col2im_cpu(col_buff, conv_in_channels_, conv_in_height_, conv_in_width_, - kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data); - } + // reverse_dimensions should return true iff we are implementing deconv, so + // that conv helpers know which dimensions are which. + virtual bool reverse_dimensions() = 0; + // Compute height_out_ and width_out_ from other parameters. 
+ virtual void compute_output_shape() = 0; + + int kernel_h_, kernel_w_; + int stride_h_, stride_w_; + int num_; + int channels_; + int pad_h_, pad_w_; + int height_, width_; + int group_; + int num_output_; + int height_out_, width_out_; + bool bias_term_; + bool is_1x1_; + + private: + // wrap im2col/col2im so we don't have to remember the (long) argument lists + inline void conv_im2col_cpu(const Dtype* data, Dtype* col_buff) { + im2col_cpu(data, conv_in_channels_, conv_in_height_, conv_in_width_, + kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff); + } + inline void conv_col2im_cpu(const Dtype* col_buff, Dtype* data) { + col2im_cpu(col_buff, conv_in_channels_, conv_in_height_, conv_in_width_, + kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data); + } #ifndef CPU_ONLY - inline void conv_im2col_gpu(const Dtype* data, Dtype* col_buff) { - im2col_gpu(data, conv_in_channels_, conv_in_height_, conv_in_width_, - kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff); - } - inline void conv_col2im_gpu(const Dtype* col_buff, Dtype* data) { - col2im_gpu(col_buff, conv_in_channels_, conv_in_height_, conv_in_width_, - kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data); - } + inline void conv_im2col_gpu(const Dtype* data, Dtype* col_buff) { + im2col_gpu(data, bottom_offset_, conv_in_channels_, conv_in_height_, + conv_in_width_, kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, + stride_w_, col_buff, 0); + } + inline void conv_col2im_gpu(const Dtype* col_buff, Dtype* data) { + col2im_gpu(col_buff, 0, conv_in_channels_, conv_in_height_, + conv_in_width_, kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, + stride_w_, data, bottom_offset_); + } + protected: + inline void conv_im2col_gpu_opt(const Dtype* data) { + im2col_gpu_opt(data, bottom_offset_, conv_in_channels_, conv_in_height_, + conv_in_width_, kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, + stride_w_, (Dtype*) transMem, 0, + opt_num2); + } + inline void conv_col2im_gpu_opt(Dtype* data) { + col2im_gpu_opt((Dtype*) transMem, 0, conv_in_channels_, conv_in_height_, + conv_in_width_, kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, + stride_w_, data, bottom_offset_, + opt_num2); + } + private: + inline void conv_transform_gpu(const Dtype* temp_buffer, Dtype* top_data) { + transform_gpu((Dtype*) temp_buffer, top_data, top_offset_, N_, + M_ * opt_num2, opt_num2); + } + inline void conv_transpose_gpu(const Dtype* data) { + opttrans(data, top_offset_, 1, M_ * group_, N_, (Dtype*) subTopMem, 0, + opt_num2); + } + protected: + inline void gpu_memset(Dtype* data, Dtype value, int count) { + ocl_memset(data, value, count); + } #endif - int conv_out_channels_; - int conv_in_channels_; - int conv_out_spatial_dim_; - int conv_in_height_; - int conv_in_width_; - int kernel_dim_; - int weight_offset_; - int col_offset_; - int output_offset_; - - Blob col_buffer_; - Blob bias_multiplier_; + private: + int conv_out_channels_; + int conv_in_channels_; + int conv_out_spatial_dim_; + int conv_in_height_; + int conv_in_width_; + int kernel_dim_; + + Blob col_buffer_; + Blob bias_multiplier_; + +//opencl related data structures + protected: + int opt_num2; + int M_, N_, K_; + int weight_offset_; + int col_offset_; + int output_offset_; + int top_offset_, top_offset_opt, bottom_offset_; + public: + static cl_mem subTopMem, transMem; + static size_t subtop_mem_size, trans_mem_size; }; /** @@ -128,52 +183,67 @@ class BaseConvolutionLayer : public Layer { * the output channel N' columns of the output matrix. 
*/ template -class ConvolutionLayer : public BaseConvolutionLayer { - public: - /** - * @param param provides ConvolutionParameter convolution_param, - * with ConvolutionLayer options: - * - num_output. The number of filters. - * - kernel_size / kernel_h / kernel_w. The filter dimensions, given by - * kernel_size for square filters or kernel_h and kernel_w for rectangular - * filters. - * - stride / stride_h / stride_w (\b optional, default 1). The filter - * stride, given by stride_size for equal dimensions or stride_h and stride_w - * for different strides. By default the convolution is dense with stride 1. - * - pad / pad_h / pad_w (\b optional, default 0). The zero-padding for - * convolution, given by pad for equal dimensions or pad_h and pad_w for - * different padding. Input padding is computed implicitly instead of - * actually padding. - * - group (\b optional, default 1). The number of filter groups. Group - * convolution is a method for reducing parameterization by selectively - * connecting input and output channels. The input and output channel dimensions must be divisible - * by the number of groups. For group @f$ \geq 1 @f$, the - * convolutional filters' input and output channels are separated s.t. each - * group takes 1 / group of the input channels and makes 1 / group of the - * output channels. Concretely 4 input channels, 8 output channels, and - * 2 groups separate input channels 1-2 and output channels 1-4 into the - * first group and input channels 3-4 and output channels 5-8 into the second - * group. - * - bias_term (\b optional, default true). Whether to have a bias. - * - engine: convolution has CAFFE (matrix multiplication) and CUDNN (library - * kernels + stream parallelism) engines. - */ - explicit ConvolutionLayer(const LayerParameter& param) - : BaseConvolutionLayer(param) {} - - virtual inline const char* type() const { return "Convolution"; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual inline bool reverse_dimensions() { return false; } - virtual void compute_output_shape(); +class ConvolutionLayer: public BaseConvolutionLayer { + public: + /** + * @param param provides ConvolutionParameter convolution_param, + * with ConvolutionLayer options: + * - num_output. The number of filters. + * - kernel_size / kernel_h / kernel_w. The filter dimensions, given by + * kernel_size for square filters or kernel_h and kernel_w for rectangular + * filters. + * - stride / stride_h / stride_w (\b optional, default 1). The filter + * stride, given by stride_size for equal dimensions or stride_h and stride_w + * for different strides. By default the convolution is dense with stride 1. + * - pad / pad_h / pad_w (\b optional, default 0). The zero-padding for + * convolution, given by pad for equal dimensions or pad_h and pad_w for + * different padding. Input padding is computed implicitly instead of + * actually padding. + * - group (\b optional, default 1). The number of filter groups. Group + * convolution is a method for reducing parameterization by selectively + * connecting input and output channels. The input and output channel dimensions must be divisible + * by the number of groups. 
For group @f$ \geq 1 @f$, the + * convolutional filters' input and output channels are separated s.t. each + * group takes 1 / group of the input channels and makes 1 / group of the + * output channels. Concretely 4 input channels, 8 output channels, and + * 2 groups separate input channels 1-2 and output channels 1-4 into the + * first group and input channels 3-4 and output channels 5-8 into the second + * group. + * - bias_term (\b optional, default true). Whether to have a bias. + * - engine: convolution has CAFFE (matrix multiplication) and CUDNN (library + * kernels + stream parallelism) engines. + */ + explicit ConvolutionLayer(const LayerParameter& param) + : BaseConvolutionLayer(param) { + } + + virtual inline const char* type() const { + return "Convolution"; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual inline bool reverse_dimensions() { + return false; + } + virtual void compute_output_shape(); +#ifndef CPU_ONLY + virtual void Forward_gpu_org(const vector*>& bottom, + const vector*>& top); + virtual void Backward_gpu_org(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Forward_gpu_batched(const vector*>& bottom, + const vector*>& top); + virtual void Backward_gpu_batched(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); +#endif }; /** @@ -191,24 +261,29 @@ class ConvolutionLayer : public BaseConvolutionLayer { * stride results in upsampling rather than downsampling). */ template -class DeconvolutionLayer : public BaseConvolutionLayer { - public: - explicit DeconvolutionLayer(const LayerParameter& param) - : BaseConvolutionLayer(param) {} - - virtual inline const char* type() const { return "Deconvolution"; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual inline bool reverse_dimensions() { return true; } - virtual void compute_output_shape(); +class DeconvolutionLayer: public BaseConvolutionLayer { + public: + explicit DeconvolutionLayer(const LayerParameter& param) + : BaseConvolutionLayer(param) { + } + + virtual inline const char* type() const { + return "Deconvolution"; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual inline bool reverse_dimensions() { + return true; + } + virtual void compute_output_shape(); }; #ifdef USE_CUDNN @@ -225,19 +300,19 @@ class DeconvolutionLayer : public BaseConvolutionLayer { * input and filter regimes the CUDNN engine is faster than the CAFFE engine, * but for fully-convolutional models and large inputs the CAFFE engine can be * faster as long as it fits in memory. 
-*/ + */ template class CuDNNConvolutionLayer : public ConvolutionLayer { - public: + public: explicit CuDNNConvolutionLayer(const LayerParameter& param) - : ConvolutionLayer(param), handles_setup_(false) {} + : ConvolutionLayer(param), handles_setup_(false) {} virtual void LayerSetUp(const vector*>& bottom, const vector*>& top); virtual void Reshape(const vector*>& bottom, const vector*>& top); virtual ~CuDNNConvolutionLayer(); - protected: + protected: virtual void Forward_gpu(const vector*>& bottom, const vector*>& top); virtual void Backward_gpu(const vector*>& top, @@ -245,10 +320,10 @@ class CuDNNConvolutionLayer : public ConvolutionLayer { bool handles_setup_; cudnnHandle_t* handle_; - cudaStream_t* stream_; + cudaStream_t* stream_; vector bottom_descs_, top_descs_; - cudnnTensorDescriptor_t bias_desc_; - cudnnFilterDescriptor_t filter_desc_; + cudnnTensorDescriptor_t bias_desc_; + cudnnFilterDescriptor_t filter_desc_; vector conv_descs_; int bottom_offset_, top_offset_, weight_offset_, bias_offset_; size_t workspaceSizeInBytes; @@ -264,34 +339,41 @@ class CuDNNConvolutionLayer : public ConvolutionLayer { * TODO(dox): thorough documentation for Forward, Backward, and proto params. */ template -class Im2colLayer : public Layer { - public: - explicit Im2colLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Im2col"; } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int ExactNumTopBlobs() const { return 1; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - - int kernel_h_, kernel_w_; - int stride_h_, stride_w_; - int channels_; - int height_, width_; - int pad_h_, pad_w_; +class Im2colLayer: public Layer { + public: + explicit Im2colLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Im2col"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + int kernel_h_, kernel_w_; + int stride_h_, stride_w_; + int channels_; + int height_, width_; + int pad_h_, pad_w_; }; // Forward declare PoolingLayer and SplitLayer for use in LRNLayer. @@ -304,152 +386,168 @@ template class SplitLayer; * TODO(dox): thorough documentation for Forward, Backward, and proto params. 
*/ template -class LRNLayer : public Layer { - public: - explicit LRNLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "LRN"; } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int ExactNumTopBlobs() const { return 1; } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); +class LRNLayer: public Layer { + public: + explicit LRNLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "LRN"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int ExactNumTopBlobs() const { + return 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + virtual void CrossChannelForward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void CrossChannelForward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void WithinChannelForward(const vector*>& bottom, + const vector*>& top); + virtual void CrossChannelBackward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void CrossChannelBackward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void WithinChannelBackward(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + int size_; + int pre_pad_; + Dtype alpha_; + Dtype beta_; + Dtype k_; + int num_; + int channels_; + int height_; + int width_; + + // Fields used for normalization ACROSS_CHANNELS + // scale_ stores the intermediate summing results + Blob scale_; + + // Fields used for normalization WITHIN_CHANNEL + shared_ptr > split_layer_; + vector*> split_top_vec_; + shared_ptr > square_layer_; + Blob square_input_; + Blob square_output_; + vector*> square_bottom_vec_; + vector*> square_top_vec_; + shared_ptr > pool_layer_; + Blob pool_output_; + vector*> pool_top_vec_; + shared_ptr > power_layer_; + Blob power_output_; + vector*> power_top_vec_; + shared_ptr > product_layer_; + Blob product_input_; + vector*> product_bottom_vec_; - virtual void CrossChannelForward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void CrossChannelForward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void WithinChannelForward(const vector*>& bottom, - const vector*>& top); - virtual void CrossChannelBackward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void CrossChannelBackward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void WithinChannelBackward(const vector*>& 
top, - const vector& propagate_down, const vector*>& bottom); - - int size_; - int pre_pad_; - Dtype alpha_; - Dtype beta_; - Dtype k_; - int num_; - int channels_; - int height_; - int width_; - - // Fields used for normalization ACROSS_CHANNELS - // scale_ stores the intermediate summing results - Blob scale_; - - // Fields used for normalization WITHIN_CHANNEL - shared_ptr > split_layer_; - vector*> split_top_vec_; - shared_ptr > square_layer_; - Blob square_input_; - Blob square_output_; - vector*> square_bottom_vec_; - vector*> square_top_vec_; - shared_ptr > pool_layer_; - Blob pool_output_; - vector*> pool_top_vec_; - shared_ptr > power_layer_; - Blob power_output_; - vector*> power_top_vec_; - shared_ptr > product_layer_; - Blob product_input_; - vector*> product_bottom_vec_; }; - -/** +/*n * @brief Pools the input image by taking the max, average, etc. within regions. * * TODO(dox): thorough documentation for Forward, Backward, and proto params. */ template -class PoolingLayer : public Layer { - public: - explicit PoolingLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "Pooling"; } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int MinTopBlobs() const { return 1; } - // MAX POOL layers can output an extra top blob for the mask; - // others can only output the pooled inputs. - virtual inline int MaxTopBlobs() const { - return (this->layer_param_.pooling_param().pool() == - PoolingParameter_PoolMethod_MAX) ? 2 : 1; - } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - virtual void Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); +class PoolingLayer: public Layer { + public: + explicit PoolingLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "Pooling"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int MinTopBlobs() const { + return 1; + } + // MAX POOL layers can output an extra top blob for the mask; + // others can only output the pooled inputs. + virtual inline int MaxTopBlobs() const { + return + (this->layer_param_.pooling_param().pool() + == PoolingParameter_PoolMethod_MAX) ? 
2 : 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + int kernel_h_, kernel_w_; + int stride_h_, stride_w_; + int pad_h_, pad_w_; + int channels_; + int height_, width_; + int pooled_height_, pooled_width_; + bool global_pooling_; + Blob rand_idx_; + Blob max_idx_; - int kernel_h_, kernel_w_; - int stride_h_, stride_w_; - int pad_h_, pad_w_; - int channels_; - int height_, width_; - int pooled_height_, pooled_width_; - bool global_pooling_; - Blob rand_idx_; - Blob max_idx_; }; #ifdef USE_CUDNN /* * @brief cuDNN implementation of PoolingLayer. * Fallback to PoolingLayer for CPU mode. -*/ + */ template class CuDNNPoolingLayer : public PoolingLayer { - public: + public: explicit CuDNNPoolingLayer(const LayerParameter& param) - : PoolingLayer(param), handles_setup_(false) {} + : PoolingLayer(param), handles_setup_(false) {} virtual void LayerSetUp(const vector*>& bottom, const vector*>& top); virtual void Reshape(const vector*>& bottom, const vector*>& top); virtual ~CuDNNPoolingLayer(); // Currently, cuDNN does not support the extra top blob. - virtual inline int MinTopBlobs() const { return -1; } - virtual inline int ExactNumTopBlobs() const { return 1; } + virtual inline int MinTopBlobs() const {return -1;} + virtual inline int ExactNumTopBlobs() const {return 1;} - protected: + protected: virtual void Forward_gpu(const vector*>& bottom, const vector*>& top); virtual void Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); bool handles_setup_; - cudnnHandle_t handle_; + cudnnHandle_t handle_; cudnnTensorDescriptor_t bottom_desc_, top_desc_; - cudnnPoolingDescriptor_t pooling_desc_; - cudnnPoolingMode_t mode_; + cudnnPoolingDescriptor_t pooling_desc_; + cudnnPoolingMode_t mode_; }; #endif @@ -460,63 +558,71 @@ class CuDNNPoolingLayer : public PoolingLayer { * images are of the same size. */ template -class SPPLayer : public Layer { - public: - explicit SPPLayer(const LayerParameter& param) - : Layer(param) {} - virtual void LayerSetUp(const vector*>& bottom, - const vector*>& top); - virtual void Reshape(const vector*>& bottom, - const vector*>& top); - - virtual inline const char* type() const { return "SPP"; } - virtual inline int ExactNumBottomBlobs() const { return 1; } - virtual inline int MinTopBlobs() const { return 1; } - // MAX POOL layers can output an extra top blob for the mask; - // others can only output the pooled inputs. - virtual inline int MaxTopBlobs() const { - return (this->layer_param_.pooling_param().pool() == - PoolingParameter_PoolMethod_MAX) ? 
2 : 1; - } - - protected: - virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); - virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); - // calculates the kernel and stride dimensions for the pooling layer, - // returns a correctly configured LayerParameter for a PoolingLayer - virtual LayerParameter GetPoolingParam(const int pyramid_level, - const int bottom_h, const int bottom_w, const SPPParameter spp_param); - - int pyramid_height_; - int bottom_h_, bottom_w_; - int channels_; - int kernel_h_, kernel_w_; - int pad_h_, pad_w_; - - /// the internal Split layer that feeds the pooling layers - shared_ptr > split_layer_; - /// top vector holder used in call to the underlying SplitLayer::Forward - vector*> split_top_vec_; - /// bottom vector holder used in call to the underlying PoolingLayer::Forward - vector*>*> pooling_bottom_vecs_; - /// the internal Pooling layers of different kernel sizes - vector > > pooling_layers_; - /// top vector holders used in call to the underlying PoolingLayer::Forward - vector*>*> pooling_top_vecs_; - /// pooling_outputs stores the outputs of the PoolingLayers - vector*> pooling_outputs_; - /// the internal Flatten layers that the Pooling layers feed into - vector*> flatten_layers_; - /// top vector holders used in call to the underlying FlattenLayer::Forward - vector*>*> flatten_top_vecs_; - /// flatten_outputs stores the outputs of the FlattenLayers - vector*> flatten_outputs_; - /// bottom vector holder used in call to the underlying ConcatLayer::Forward - vector*> concat_bottom_vec_; - /// the internal Concat layers that the Flatten layers feed into - shared_ptr > concat_layer_; +class SPPLayer: public Layer { + public: + explicit SPPLayer(const LayerParameter& param) + : Layer(param) { + } + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { + return "SPP"; + } + virtual inline int ExactNumBottomBlobs() const { + return 1; + } + virtual inline int MinTopBlobs() const { + return 1; + } + // MAX POOL layers can output an extra top blob for the mask; + // others can only output the pooled inputs. + virtual inline int MaxTopBlobs() const { + return + (this->layer_param_.pooling_param().pool() + == PoolingParameter_PoolMethod_MAX) ? 
2 : 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + // calculates the kernel and stride dimensions for the pooling layer, + // returns a correctly configured LayerParameter for a PoolingLayer + virtual LayerParameter GetPoolingParam(const int pyramid_level, + const int bottom_h, const int bottom_w, const SPPParameter spp_param); + + int pyramid_height_; + int bottom_h_, bottom_w_; + int channels_; + int kernel_h_, kernel_w_; + int pad_h_, pad_w_; + + /// the internal Split layer that feeds the pooling layers + shared_ptr > split_layer_; + /// top vector holder used in call to the underlying SplitLayer::Forward + vector*> split_top_vec_; + /// bottom vector holder used in call to the underlying PoolingLayer::Forward + vector*>*> pooling_bottom_vecs_; + /// the internal Pooling layers of different kernel sizes + vector > > pooling_layers_; + /// top vector holders used in call to the underlying PoolingLayer::Forward + vector*>*> pooling_top_vecs_; + /// pooling_outputs stores the outputs of the PoolingLayers + vector*> pooling_outputs_; + /// the internal Flatten layers that the Pooling layers feed into + vector*> flatten_layers_; + /// top vector holders used in call to the underlying FlattenLayer::Forward + vector*>*> flatten_top_vecs_; + /// flatten_outputs stores the outputs of the FlattenLayers + vector*> flatten_outputs_; + /// bottom vector holder used in call to the underlying ConcatLayer::Forward + vector*> concat_bottom_vec_; + /// the internal Concat layers that the Flatten layers feed into + shared_ptr > concat_layer_; }; } // namespace caffe diff --git a/models/bvlc_alexnet/bvlc_alexnet/solver_without_dropout.prototxt b/models/bvlc_alexnet/bvlc_alexnet/solver_without_dropout.prototxt new file mode 100644 index 00000000..37b1d0d3 --- /dev/null +++ b/models/bvlc_alexnet/bvlc_alexnet/solver_without_dropout.prototxt @@ -0,0 +1,14 @@ +net: "models/bvlc_alexnet/train_val_without_dropout.prototxt" +test_iter: 1 +test_interval: 1000 +base_lr: 0.01 +lr_policy: "step" +gamma: 0.1 +stepsize: 100000 +display: 1 +max_iter: 450000 +momentum: 0.9 +weight_decay: 0.0005 +snapshot: 10000 +snapshot_prefix: "models/bvlc_alexnet/caffe_alexnet_train" +solver_mode: GPU diff --git a/models/bvlc_alexnet/bvlc_alexnet/train_val_without_dropout.prototxt b/models/bvlc_alexnet/bvlc_alexnet/train_val_without_dropout.prototxt new file mode 100644 index 00000000..f269ca0d --- /dev/null +++ b/models/bvlc_alexnet/bvlc_alexnet/train_val_without_dropout.prototxt @@ -0,0 +1,366 @@ +name: "AlexNet" +layer { + name: "data" + type: "Data" + top: "data" + top: "label" + include { + phase: TRAIN + } + transform_param { + mirror: true + crop_size: 227 + mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto" + } + data_param { + source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb" + batch_size: 256 + backend: LMDB + } +} +layer { + name: "data" + type: "Data" + top: "data" + top: "label" + include { + phase: TEST + } + transform_param { + mirror: false + crop_size: 227 + mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto" + } + data_param { + source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb" + batch_size: 50 + backend: LMDB + } +} +layer { + name: "conv1" + type: "Convolution" + bottom: "data" + top: "conv1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + 
num_output: 96 + kernel_size: 11 + stride: 4 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu1" + type: "ReLU" + bottom: "conv1" + top: "conv1" +} +layer { + name: "norm1" + type: "LRN" + bottom: "conv1" + top: "norm1" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layer { + name: "pool1" + type: "Pooling" + bottom: "norm1" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "conv2" + type: "Convolution" + bottom: "pool1" + top: "conv2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 2 + kernel_size: 5 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu2" + type: "ReLU" + bottom: "conv2" + top: "conv2" +} +layer { + name: "norm2" + type: "LRN" + bottom: "conv2" + top: "norm2" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layer { + name: "pool2" + type: "Pooling" + bottom: "norm2" + top: "pool2" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "conv3" + type: "Convolution" + bottom: "pool2" + top: "conv3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu3" + type: "ReLU" + bottom: "conv3" + top: "conv3" +} +layer { + name: "conv4" + type: "Convolution" + bottom: "conv3" + top: "conv4" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu4" + type: "ReLU" + bottom: "conv4" + top: "conv4" +} +layer { + name: "conv5" + type: "Convolution" + bottom: "conv4" + top: "conv5" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu5" + type: "ReLU" + bottom: "conv5" + top: "conv5" +} +layer { + name: "pool5" + type: "Pooling" + bottom: "conv5" + top: "pool5" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "fc6" + type: "InnerProduct" + bottom: "pool5" + top: "fc6" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 4096 + weight_filler { + type: "gaussian" + std: 0.005 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu6" + type: "ReLU" + bottom: "fc6" + top: "fc6" +} +layer { + name: "fc7" + type: "InnerProduct" + bottom: "fc6" + top: "fc7" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 4096 + weight_filler { + type: "gaussian" + std: 0.005 + } + bias_filler { + type: "constant" + value: 0.1 + } + } +} +layer { + name: "relu7" + type: "ReLU" + bottom: "fc7" + top: "fc7" +} +layer { + name: "fc8" + type: "InnerProduct" + bottom: "fc7" + top: "fc8" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + 
lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 1000 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "accuracy" + type: "Accuracy" + bottom: "fc8" + bottom: "label" + top: "accuracy" + include { + phase: TEST + } +} +layer { + name: "loss" + type: "SoftmaxWithLoss" + bottom: "fc8" + bottom: "label" + top: "loss" +} diff --git a/models/bvlc_alexnet/solver.prototxt b/models/bvlc_alexnet/solver.prototxt index 129265e6..6f23e9d1 100644 --- a/models/bvlc_alexnet/solver.prototxt +++ b/models/bvlc_alexnet/solver.prototxt @@ -1,11 +1,11 @@ net: "models/bvlc_alexnet/train_val.prototxt" -test_iter: 1000 +test_iter: 1 test_interval: 1000 base_lr: 0.01 lr_policy: "step" gamma: 0.1 stepsize: 100000 -display: 20 +display: 1 max_iter: 450000 momentum: 0.9 weight_decay: 0.0005 diff --git a/models/bvlc_alexnet/train_val.prototxt b/models/bvlc_alexnet/train_val.prototxt index 588b4ea7..1f9654be 100644 --- a/models/bvlc_alexnet/train_val.prototxt +++ b/models/bvlc_alexnet/train_val.prototxt @@ -10,10 +10,10 @@ layer { transform_param { mirror: true crop_size: 227 - mean_file: "data/ilsvrc12/imagenet_mean.binaryproto" + mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto" } data_param { - source: "examples/imagenet/ilsvrc12_train_lmdb" + source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_train_lmdb" batch_size: 256 backend: LMDB } @@ -29,10 +29,10 @@ layer { transform_param { mirror: false crop_size: 227 - mean_file: "data/ilsvrc12/imagenet_mean.binaryproto" + mean_file: "/home/yugao/imagenet_data_lmdb/imagenet_mean.binaryproto" } data_param { - source: "examples/imagenet/ilsvrc12_val_lmdb" + source: "/home/yugao/imagenet_data_lmdb/ilsvrc12_val_lmdb" batch_size: 50 backend: LMDB } diff --git a/src/caffe/CMakeLists.txt b/src/caffe/CMakeLists.txt index 40e6c11f..3e675c20 100644 --- a/src/caffe/CMakeLists.txt +++ b/src/caffe/CMakeLists.txt @@ -32,5 +32,3 @@ install(TARGETS caffe proto EXPORT CaffeTargets DESTINATION lib) file(WRITE ${PROJECT_BINARY_DIR}/__init__.py) list(APPEND proto_python ${PROJECT_BINARY_DIR}/__init__.py) install(PROGRAMS ${proto_python} DESTINATION python/caffe/proto) - - diff --git a/src/caffe/blob.cpp b/src/caffe/blob.cpp index 94fdcc35..ece07d14 100644 --- a/src/caffe/blob.cpp +++ b/src/caffe/blob.cpp @@ -55,22 +55,20 @@ void Blob::ReshapeLike(const Blob& other) { template Blob::Blob(const int num, const int channels, const int height, const int width) - // capacity_ must be initialized before calling Reshape - : capacity_(0) { + : capacity_(0) { Reshape(num, channels, height, width); } template Blob::Blob(const vector& shape) - // capacity_ must be initialized before calling Reshape - : capacity_(0) { + : capacity_(0) { Reshape(shape); } template const Dtype* Blob::cpu_data() const { - CHECK(data_); - return (const Dtype*)data_->cpu_data(); + CHECK (data_); + return (const Dtype*) data_->cpu_data(); } template @@ -81,43 +79,49 @@ void Blob::set_cpu_data(Dtype* data) { template const Dtype* Blob::gpu_data() const { - CHECK(data_); - return (const Dtype*)data_->gpu_data(); + CHECK (data_); + return (const Dtype*) data_->gpu_data(); +} + +template +const Dtype* Blob::gpu_cache_data() const { + CHECK (data_); + return (const Dtype*) data_->gpu_cache_data(); } template const Dtype* Blob::cpu_diff() const { - CHECK(diff_); - return (const Dtype*)diff_->cpu_data(); + CHECK (diff_); + return (const Dtype*) diff_->cpu_data(); } template const Dtype* Blob::gpu_diff() const { - 
CHECK(diff_); - return (const Dtype*)diff_->gpu_data(); + CHECK (diff_); + return (const Dtype*) diff_->gpu_data(); } template Dtype* Blob::mutable_cpu_data() { - CHECK(data_); + CHECK (data_); return static_cast(data_->mutable_cpu_data()); } template Dtype* Blob::mutable_gpu_data() { - CHECK(data_); + CHECK (data_); return static_cast(data_->mutable_gpu_data()); } template Dtype* Blob::mutable_cpu_diff() { - CHECK(diff_); + CHECK (diff_); return static_cast(diff_->mutable_cpu_data()); } template Dtype* Blob::mutable_gpu_diff() { - CHECK(diff_); + CHECK (diff_); return static_cast(diff_->mutable_gpu_data()); } @@ -136,8 +140,12 @@ void Blob::ShareDiff(const Blob& other) { // The "update" method is used for parameter blobs in a Net, which are stored // as Blob or Blob -- hence we do not define it for // Blob or Blob. -template <> void Blob::Update() { NOT_IMPLEMENTED; } -template <> void Blob::Update() { NOT_IMPLEMENTED; } +template <> void Blob::Update() { + NOT_IMPLEMENTED; +} +template <> void Blob::Update() { + NOT_IMPLEMENTED; +} template void Blob::Update() { @@ -145,17 +153,15 @@ void Blob::Update() { switch (data_->head()) { case SyncedMemory::HEAD_AT_CPU: // perform computation on CPU - caffe_axpy(count_, Dtype(-1), - static_cast(diff_->cpu_data()), - static_cast(data_->mutable_cpu_data())); + caffe_axpy < Dtype + > (count_, Dtype(-1), static_cast(diff_->cpu_data()), static_cast(data_->mutable_cpu_data())); break; case SyncedMemory::HEAD_AT_GPU: case SyncedMemory::SYNCED: #ifndef CPU_ONLY // perform computation on GPU - caffe_gpu_axpy(count_, Dtype(-1), - static_cast(diff_->gpu_data()), - static_cast(data_->mutable_gpu_data())); + caffe_gpu_axpy < Dtype + > (count_, Dtype(-1), static_cast(diff_->gpu_data()), static_cast(data_->mutable_gpu_data())); #else NO_GPU; #endif @@ -177,7 +183,9 @@ template <> int Blob::asum_data() const { template Dtype Blob::asum_data() const { - if (!data_) { return 0; } + if (!data_) { + return 0; + } switch (data_->head()) { case SyncedMemory::HEAD_AT_CPU: return caffe_cpu_asum(count_, cpu_data()); @@ -212,7 +220,9 @@ template <> int Blob::asum_diff() const { template Dtype Blob::asum_diff() const { - if (!diff_) { return 0; } + if (!diff_) { + return 0; + } switch (diff_->head()) { case SyncedMemory::HEAD_AT_CPU: return caffe_cpu_asum(count_, cpu_diff()); @@ -249,7 +259,9 @@ template Dtype Blob::sumsq_data() const { Dtype sumsq; const Dtype* data; - if (!data_) { return 0; } + if (!data_) { + return 0; + } switch (data_->head()) { case SyncedMemory::HEAD_AT_CPU: data = cpu_data(); @@ -286,7 +298,9 @@ template Dtype Blob::sumsq_diff() const { Dtype sumsq; const Dtype* diff; - if (!diff_) { return 0; } + if (!diff_) { + return 0; + } switch (diff_->head()) { case SyncedMemory::HEAD_AT_CPU: diff = cpu_diff(); @@ -320,7 +334,9 @@ template <> void Blob::scale_data(int scale_factor) { template void Blob::scale_data(Dtype scale_factor) { Dtype* data; - if (!data_) { return; } + if (!data_) { + return; + } switch (data_->head()) { case SyncedMemory::HEAD_AT_CPU: data = mutable_cpu_data(); @@ -353,7 +369,9 @@ template <> void Blob::scale_diff(int scale_factor) { template void Blob::scale_diff(Dtype scale_factor) { Dtype* diff; - if (!diff_) { return; } + if (!diff_) { + return; + } switch (diff_->head()) { case SyncedMemory::HEAD_AT_CPU: diff = mutable_cpu_diff(); @@ -377,19 +395,17 @@ void Blob::scale_diff(Dtype scale_factor) { template bool Blob::ShapeEquals(const BlobProto& other) { - if (other.has_num() || other.has_channels() || - other.has_height() || 
other.has_width()) { + if (other.has_num() || other.has_channels() || other.has_height() + || other.has_width()) { // Using deprecated 4D Blob dimensions -- // shape is (num, channels, height, width). // Note: we do not use the normal Blob::num(), Blob::channels(), etc. // methods as these index from the beginning of the blob shape, where legacy // parameter blobs were indexed from the end of the blob shape (e.g., bias // Blob shape (1 x 1 x 1 x N), IP layer weight Blob shape (1 x 1 x M x N)). - return shape_.size() <= 4 && - LegacyShape(-4) == other.num() && - LegacyShape(-3) == other.channels() && - LegacyShape(-2) == other.height() && - LegacyShape(-1) == other.width(); + return shape_.size() <= 4 && LegacyShape(-4) == other.num() + && LegacyShape(-3) == other.channels() + && LegacyShape(-2) == other.height() && LegacyShape(-1) == other.width(); } vector other_shape(other.shape().dim_size()); for (int i = 0; i < other.shape().dim_size(); ++i) { @@ -410,10 +426,10 @@ void Blob::CopyFrom(const Blob& source, bool copy_diff, bool reshape) { switch (Caffe::mode()) { case Caffe::GPU: if (copy_diff) { - caffe_copy(count_, source.gpu_diff(), + caffe_gpu_copy(count_, source.gpu_diff(), static_cast(diff_->mutable_gpu_data())); } else { - caffe_copy(count_, source.gpu_data(), + caffe_gpu_copy(count_, source.gpu_data(), static_cast(data_->mutable_gpu_data())); } break; @@ -435,8 +451,8 @@ template void Blob::FromProto(const BlobProto& proto, bool reshape) { if (reshape) { vector shape; - if (proto.has_num() || proto.has_channels() || - proto.has_height() || proto.has_width()) { + if (proto.has_num() || proto.has_channels() || proto.has_height() + || proto.has_width()) { // Using deprecated 4D Blob dimensions -- // shape is (num, channels, height, width). shape.resize(4); @@ -487,9 +503,9 @@ void Blob::ToProto(BlobProto* proto, bool write_diff) const { } } -INSTANTIATE_CLASS(Blob); -template class Blob; -template class Blob; +INSTANTIATE_CLASS (Blob); +template class Blob ; +template class Blob ; } // namespace caffe diff --git a/src/caffe/cmake_install.cmake b/src/caffe/cmake_install.cmake new file mode 100644 index 00000000..f98ef538 --- /dev/null +++ b/src/caffe/cmake_install.cmake @@ -0,0 +1,79 @@ +# Install script for directory: /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe + +# Set the install prefix +IF(NOT DEFINED CMAKE_INSTALL_PREFIX) + SET(CMAKE_INSTALL_PREFIX "/home/yugao/caffe-merge-junli/caffe-yb/caffe/install") +ENDIF(NOT DEFINED CMAKE_INSTALL_PREFIX) +STRING(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}") + +# Set the install configuration name. +IF(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME) + IF(BUILD_TYPE) + STRING(REGEX REPLACE "^[^A-Za-z0-9_]+" "" + CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}") + ELSE(BUILD_TYPE) + SET(CMAKE_INSTALL_CONFIG_NAME "Release") + ENDIF(BUILD_TYPE) + MESSAGE(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"") +ENDIF(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME) + +# Set the component getting installed. +IF(NOT CMAKE_INSTALL_COMPONENT) + IF(COMPONENT) + MESSAGE(STATUS "Install component: \"${COMPONENT}\"") + SET(CMAKE_INSTALL_COMPONENT "${COMPONENT}") + ELSE(COMPONENT) + SET(CMAKE_INSTALL_COMPONENT) + ENDIF(COMPONENT) +ENDIF(NOT CMAKE_INSTALL_COMPONENT) + +# Install shared libraries without execute permission? 
+IF(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE) + SET(CMAKE_INSTALL_SO_NO_EXE "1") +ENDIF(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE) + +IF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified") + FILE(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/include" TYPE DIRECTORY FILES "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe") +ENDIF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified") + +IF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified") + FILE(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/include/caffe/proto" TYPE FILE FILES "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe.pb.h") +ENDIF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified") + +IF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified") + IF(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/lib/libcaffe.so" AND + NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/lib/libcaffe.so") + FILE(RPATH_CHECK + FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/lib/libcaffe.so" + RPATH "/home/yugao/caffe-merge-junli/caffe-yb/caffe/install/lib:/usr/local/cuda/lib64:/usr/local/lib") + ENDIF() + FILE(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/lib" TYPE SHARED_LIBRARY FILES "/home/yugao/caffe-merge-junli/caffe-yb/caffe/lib/libcaffe.so") + IF(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/lib/libcaffe.so" AND + NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/lib/libcaffe.so") + FILE(RPATH_CHANGE + FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/lib/libcaffe.so" + OLD_RPATH "/usr/local/cuda/lib64:/usr/local/lib:::::::::::::::::::::::::::::::::::::::::::::::::::::::::" + NEW_RPATH "/home/yugao/caffe-merge-junli/caffe-yb/caffe/install/lib:/usr/local/cuda/lib64:/usr/local/lib") + IF(CMAKE_INSTALL_DO_STRIP) + EXECUTE_PROCESS(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/lib/libcaffe.so") + ENDIF(CMAKE_INSTALL_DO_STRIP) + ENDIF() +ENDIF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified") + +IF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified") + FILE(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/lib" TYPE STATIC_LIBRARY FILES "/home/yugao/caffe-merge-junli/caffe-yb/caffe/lib/libproto.a") +ENDIF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified") + +IF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified") + FILE(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/python/caffe/proto" TYPE PROGRAM FILES + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/include/caffe/proto/caffe_pb2.py" + "/home/yugao/caffe-merge-junli/caffe-yb/caffe/__init__.py" + ) +ENDIF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified") + +IF(NOT CMAKE_INSTALL_LOCAL_ONLY) + # Include the install script for each subdirectory. 
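# Rough usage sketch for this generated install script (assumptions: a
# standard CMake build tree; the absolute /home/yugao/... paths above are the
# original author's). It is what `make install` executes, and it can also be
# run directly in CMake script mode; the INCLUDE just below then pulls in the
# per-subdirectory rules (src/caffe/test):
#
#   make install DESTDIR=/tmp/caffe-stage        # staged install
#   cmake -DCMAKE_INSTALL_PREFIX=/opt/caffe -P src/caffe/cmake_install.cmake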
+ INCLUDE("/home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/cmake_install.cmake") + +ENDIF(NOT CMAKE_INSTALL_LOCAL_ONLY) + diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp index af96cac4..9ed4207a 100644 --- a/src/caffe/common.cpp +++ b/src/caffe/common.cpp @@ -11,25 +11,29 @@ shared_ptr Caffe::singleton_; // random seeding int64_t cluster_seedgen(void) { - int64_t s, seed, pid; - FILE* f = fopen("/dev/urandom", "rb"); - if (f && fread(&seed, 1, sizeof(seed), f) == sizeof(seed)) { - fclose(f); - return seed; - } - - LOG(INFO) << "System entropy source not available, " - "using fallback algorithm to generate seed instead."; - if (f) - fclose(f); - - pid = getpid(); - s = time(NULL); - seed = abs(((s * 181) * ((pid - 83) * 359)) % 104729); - return seed; + //To fix: for now we use fixed seed to get same result each time + /* + int64_t s, seed, pid; + FILE* f = fopen("/dev/urandom", "rb"); + if (f && fread(&seed, 1, sizeof(seed), f) == sizeof(seed)) { + fclose(f); + return seed; + } + + LOG(INFO) << "System entropy source not available, " + "using fallback algorithm to generate seed instead."; + if (f) + fclose(f); + + pid = getpid(); + s = time(NULL); + seed = abs(((s * 181) * ((pid - 83) * 359)) % 104729); + //return seed; + LOG(WARNING) << "return fixed seed 37"; + */ + return 37; } - void GlobalInit(int* pargc, char*** pargv) { // Google flags. ::gflags::ParseCommandLineFlags(pargc, pargv, true); @@ -42,9 +46,11 @@ void GlobalInit(int* pargc, char*** pargv) { #ifdef CPU_ONLY // CPU-only Caffe. Caffe::Caffe() - : random_generator_(), mode_(Caffe::CPU) { } +: random_generator_(), mode_(Caffe::CPU) { +} -Caffe::~Caffe() { } +Caffe::~Caffe() { +} void Caffe::set_random_seed(const unsigned int seed) { // RNG seed @@ -59,19 +65,18 @@ void Caffe::DeviceQuery() { NO_GPU; } - class Caffe::RNG::Generator { - public: + public: Generator() : rng_(new caffe::rng_t(cluster_seedgen())) {} explicit Generator(unsigned int seed) : rng_(new caffe::rng_t(seed)) {} - caffe::rng_t* rng() { return rng_.get(); } - private: + caffe::rng_t* rng() {return rng_.get();} + private: shared_ptr rng_; }; -Caffe::RNG::RNG() : generator_(new Generator()) { } +Caffe::RNG::RNG() : generator_(new Generator()) {} -Caffe::RNG::RNG(unsigned int seed) : generator_(new Generator(seed)) { } +Caffe::RNG::RNG(unsigned int seed) : generator_(new Generator(seed)) {} Caffe::RNG& Caffe::RNG::operator=(const RNG& other) { generator_ = other.generator_; @@ -84,116 +89,58 @@ void* Caffe::RNG::generator() { #else // Normal GPU + CPU Caffe. -Caffe::Caffe() - : cublas_handle_(NULL), curand_generator_(NULL), random_generator_(), - mode_(Caffe::CPU) { - // Try to create a cublas handler, and report an error if failed (but we will - // keep the program running as one might just want to run CPU code). - if (cublasCreate(&cublas_handle_) != CUBLAS_STATUS_SUCCESS) { - LOG(ERROR) << "Cannot create Cublas handle. Cublas won't be available."; - } - // Try to create a curand handler. - if (curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT) - != CURAND_STATUS_SUCCESS || - curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen()) - != CURAND_STATUS_SUCCESS) { - LOG(ERROR) << "Cannot create Curand generator. 
Curand won't be available."; +Caffe::Caffe() { + amdDevice.Init(); + cl_int err = clblasSetup(); + if (err != CL_SUCCESS) { + LOG(ERROR) << "clBLAS setup failed " << err; } } Caffe::~Caffe() { - if (cublas_handle_) CUBLAS_CHECK(cublasDestroy(cublas_handle_)); - if (curand_generator_) { - CURAND_CHECK(curandDestroyGenerator(curand_generator_)); - } + clblasTeardown(); } void Caffe::set_random_seed(const unsigned int seed) { - // Curand seed - static bool g_curand_availability_logged = false; - if (Get().curand_generator_) { - CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator(), - seed)); - CURAND_CHECK(curandSetGeneratorOffset(curand_generator(), 0)); - } else { - if (!g_curand_availability_logged) { - LOG(ERROR) << - "Curand not available. Skipping setting the curand seed."; - g_curand_availability_logged = true; - } - } - // RNG seed - Get().random_generator_.reset(new RNG(seed)); + // RNG seed + Get().random_generator_.reset(new RNG(seed)); + caffe_gpu_uniform(0, NULL, seed); + caffe_gpu_uniform((float*)NULL, 0, (float)0.0, (float)1.0, seed); } void Caffe::SetDevice(const int device_id) { - int current_device; - CUDA_CHECK(cudaGetDevice(¤t_device)); - if (current_device == device_id) { + if (amdDevice.GetDevice() == device_id) { return; } - // The call to cudaSetDevice must come before any calls to Get, which - // may perform initialization using the GPU. - CUDA_CHECK(cudaSetDevice(device_id)); - if (Get().cublas_handle_) CUBLAS_CHECK(cublasDestroy(Get().cublas_handle_)); - if (Get().curand_generator_) { - CURAND_CHECK(curandDestroyGenerator(Get().curand_generator_)); - } - CUBLAS_CHECK(cublasCreate(&Get().cublas_handle_)); - CURAND_CHECK(curandCreateGenerator(&Get().curand_generator_, - CURAND_RNG_PSEUDO_DEFAULT)); - CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_, - cluster_seedgen())); + amdDevice.Init(device_id); } void Caffe::DeviceQuery() { - cudaDeviceProp prop; - int device; - if (cudaSuccess != cudaGetDevice(&device)) { - printf("No cuda device present.\n"); - return; - } - CUDA_CHECK(cudaGetDeviceProperties(&prop, device)); - LOG(INFO) << "Device id: " << device; - LOG(INFO) << "Major revision number: " << prop.major; - LOG(INFO) << "Minor revision number: " << prop.minor; - LOG(INFO) << "Name: " << prop.name; - LOG(INFO) << "Total global memory: " << prop.totalGlobalMem; - LOG(INFO) << "Total shared memory per block: " << prop.sharedMemPerBlock; - LOG(INFO) << "Total registers per block: " << prop.regsPerBlock; - LOG(INFO) << "Warp size: " << prop.warpSize; - LOG(INFO) << "Maximum memory pitch: " << prop.memPitch; - LOG(INFO) << "Maximum threads per block: " << prop.maxThreadsPerBlock; - LOG(INFO) << "Maximum dimension of block: " - << prop.maxThreadsDim[0] << ", " << prop.maxThreadsDim[1] << ", " - << prop.maxThreadsDim[2]; - LOG(INFO) << "Maximum dimension of grid: " - << prop.maxGridSize[0] << ", " << prop.maxGridSize[1] << ", " - << prop.maxGridSize[2]; - LOG(INFO) << "Clock rate: " << prop.clockRate; - LOG(INFO) << "Total constant memory: " << prop.totalConstMem; - LOG(INFO) << "Texture alignment: " << prop.textureAlignment; - LOG(INFO) << "Concurrent copy and execution: " - << (prop.deviceOverlap ? "Yes" : "No"); - LOG(INFO) << "Number of multiprocessors: " << prop.multiProcessorCount; - LOG(INFO) << "Kernel execution timeout: " - << (prop.kernelExecTimeoutEnabled ? 
"Yes" : "No"); - return; + amdDevice.DeviceQuery(); } - class Caffe::RNG::Generator { - public: - Generator() : rng_(new caffe::rng_t(cluster_seedgen())) {} - explicit Generator(unsigned int seed) : rng_(new caffe::rng_t(seed)) {} - caffe::rng_t* rng() { return rng_.get(); } - private: - shared_ptr rng_; + public: + Generator() + : rng_(new caffe::rng_t(cluster_seedgen())) { + } + explicit Generator(unsigned int seed) + : rng_(new caffe::rng_t(seed)) { + } + caffe::rng_t* rng() { + return rng_.get(); + } + private: + shared_ptr rng_; }; -Caffe::RNG::RNG() : generator_(new Generator()) { } +Caffe::RNG::RNG() + : generator_(new Generator()) { +} -Caffe::RNG::RNG(unsigned int seed) : generator_(new Generator(seed)) { } +Caffe::RNG::RNG(unsigned int seed) + : generator_(new Generator(seed)) { +} Caffe::RNG& Caffe::RNG::operator=(const RNG& other) { generator_.reset(other.generator_.get()); @@ -204,68 +151,6 @@ void* Caffe::RNG::generator() { return static_cast(generator_->rng()); } -const char* cublasGetErrorString(cublasStatus_t error) { - switch (error) { - case CUBLAS_STATUS_SUCCESS: - return "CUBLAS_STATUS_SUCCESS"; - case CUBLAS_STATUS_NOT_INITIALIZED: - return "CUBLAS_STATUS_NOT_INITIALIZED"; - case CUBLAS_STATUS_ALLOC_FAILED: - return "CUBLAS_STATUS_ALLOC_FAILED"; - case CUBLAS_STATUS_INVALID_VALUE: - return "CUBLAS_STATUS_INVALID_VALUE"; - case CUBLAS_STATUS_ARCH_MISMATCH: - return "CUBLAS_STATUS_ARCH_MISMATCH"; - case CUBLAS_STATUS_MAPPING_ERROR: - return "CUBLAS_STATUS_MAPPING_ERROR"; - case CUBLAS_STATUS_EXECUTION_FAILED: - return "CUBLAS_STATUS_EXECUTION_FAILED"; - case CUBLAS_STATUS_INTERNAL_ERROR: - return "CUBLAS_STATUS_INTERNAL_ERROR"; -#if CUDA_VERSION >= 6000 - case CUBLAS_STATUS_NOT_SUPPORTED: - return "CUBLAS_STATUS_NOT_SUPPORTED"; -#endif -#if CUDA_VERSION >= 6050 - case CUBLAS_STATUS_LICENSE_ERROR: - return "CUBLAS_STATUS_LICENSE_ERROR"; -#endif - } - return "Unknown cublas status"; -} - -const char* curandGetErrorString(curandStatus_t error) { - switch (error) { - case CURAND_STATUS_SUCCESS: - return "CURAND_STATUS_SUCCESS"; - case CURAND_STATUS_VERSION_MISMATCH: - return "CURAND_STATUS_VERSION_MISMATCH"; - case CURAND_STATUS_NOT_INITIALIZED: - return "CURAND_STATUS_NOT_INITIALIZED"; - case CURAND_STATUS_ALLOCATION_FAILED: - return "CURAND_STATUS_ALLOCATION_FAILED"; - case CURAND_STATUS_TYPE_ERROR: - return "CURAND_STATUS_TYPE_ERROR"; - case CURAND_STATUS_OUT_OF_RANGE: - return "CURAND_STATUS_OUT_OF_RANGE"; - case CURAND_STATUS_LENGTH_NOT_MULTIPLE: - return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; - case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: - return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; - case CURAND_STATUS_LAUNCH_FAILURE: - return "CURAND_STATUS_LAUNCH_FAILURE"; - case CURAND_STATUS_PREEXISTING_FAILURE: - return "CURAND_STATUS_PREEXISTING_FAILURE"; - case CURAND_STATUS_INITIALIZATION_FAILED: - return "CURAND_STATUS_INITIALIZATION_FAILED"; - case CURAND_STATUS_ARCH_MISMATCH: - return "CURAND_STATUS_ARCH_MISMATCH"; - case CURAND_STATUS_INTERNAL_ERROR: - return "CURAND_STATUS_INTERNAL_ERROR"; - } - return "Unknown curand status"; -} - #endif // CPU_ONLY } // namespace caffe diff --git a/src/caffe/data_transformer.cpp b/src/caffe/data_transformer.cpp index 22633922..1137bac3 100644 --- a/src/caffe/data_transformer.cpp +++ b/src/caffe/data_transformer.cpp @@ -7,17 +7,17 @@ #include "caffe/util/io.hpp" #include "caffe/util/math_functions.hpp" #include "caffe/util/rng.hpp" - +#include "caffe/util/benchmark.hpp" namespace caffe { -template +template 
DataTransformer::DataTransformer(const TransformationParameter& param, Phase phase) : param_(param), phase_(phase) { // check if we want to use mean_file if (param_.has_mean_file()) { - CHECK_EQ(param_.mean_value_size(), 0) << - "Cannot specify mean_file and mean_value at the same time"; + CHECK_EQ(param_.mean_value_size(), 0) + << "Cannot specify mean_file and mean_value at the same time"; const string& mean_file = param.mean_file(); LOG(INFO) << "Loading mean file from: " << mean_file; BlobProto blob_proto; @@ -26,17 +26,17 @@ DataTransformer::DataTransformer(const TransformationParameter& param, } // check if we want to use mean_value if (param_.mean_value_size() > 0) { - CHECK(param_.has_mean_file() == false) << - "Cannot specify mean_file and mean_value at the same time"; + CHECK(param_.has_mean_file() == false) + << "Cannot specify mean_file and mean_value at the same time"; for (int c = 0; c < param_.mean_value_size(); ++c) { mean_values_.push_back(param_.mean_value(c)); } } } -template +template void DataTransformer::Transform(const Datum& datum, - Dtype* transformed_data) { + Dtype* transformed_data) { const string& data = datum.data(); const int datum_channels = datum.channels(); const int datum_height = datum.height(); @@ -61,8 +61,9 @@ void DataTransformer::Transform(const Datum& datum, mean = data_mean_.mutable_cpu_data(); } if (has_mean_values) { - CHECK(mean_values_.size() == 1 || mean_values_.size() == datum_channels) << - "Specify either 1 mean_value or as many as channels: " << datum_channels; + CHECK(mean_values_.size() == 1 || mean_values_.size() == datum_channels) + << "Specify either 1 mean_value or as many as channels: " + << datum_channels; if (datum_channels > 1 && mean_values_.size() == 1) { // Replicate the mean_value for simplicity for (int c = 1; c < datum_channels; ++c) { @@ -102,17 +103,17 @@ void DataTransformer::Transform(const Datum& datum, } if (has_uint8) { datum_element = - static_cast(static_cast(data[data_index])); + static_cast(static_cast(data[data_index])); } else { datum_element = datum.float_data(data_index); } if (has_mean_file) { - transformed_data[top_index] = - (datum_element - mean[data_index]) * scale; + transformed_data[top_index] = (datum_element - mean[data_index]) + * scale; } else { if (has_mean_values) { - transformed_data[top_index] = - (datum_element - mean_values_[c]) * scale; + transformed_data[top_index] = (datum_element - mean_values_[c]) + * scale; } else { transformed_data[top_index] = datum_element * scale; } @@ -122,16 +123,17 @@ void DataTransformer::Transform(const Datum& datum, } } -template +template void DataTransformer::Transform(const Datum& datum, - Blob* transformed_blob) { + Blob* transformed_blob) { + // If datum is encoded, decoded and transform the cv::image. if (datum.encoded()) { CHECK(!(param_.force_color() && param_.force_gray())) << "cannot set both force_color and force_gray"; cv::Mat cv_img; if (param_.force_color() || param_.force_gray()) { - // If force_color then decode in color otherwise decode in gray. + // If force_color then decode in color otherwise decode in gray. 
cv_img = DecodeDatumToCVMat(datum, param_.force_color()); } else { cv_img = DecodeDatumToCVMatNative(datum); @@ -172,9 +174,9 @@ void DataTransformer::Transform(const Datum& datum, Transform(datum, transformed_data); } -template +template void DataTransformer::Transform(const vector & datum_vector, - Blob* transformed_blob) { + Blob* transformed_blob) { const int datum_num = datum_vector.size(); const int num = transformed_blob->num(); const int channels = transformed_blob->channels(); @@ -182,9 +184,9 @@ void DataTransformer::Transform(const vector & datum_vector, const int width = transformed_blob->width(); CHECK_GT(datum_num, 0) << "There is no datum to add"; - CHECK_LE(datum_num, num) << - "The size of datum_vector must be no greater than transformed_blob->num()"; - Blob uni_blob(1, channels, height, width); + CHECK_LE(datum_num, num) + << "The size of datum_vector must be no greater than transformed_blob->num()"; + Blob < Dtype > uni_blob(1, channels, height, width); for (int item_id = 0; item_id < datum_num; ++item_id) { int offset = transformed_blob->offset(item_id); uni_blob.set_cpu_data(transformed_blob->mutable_cpu_data() + offset); @@ -192,9 +194,9 @@ void DataTransformer::Transform(const vector & datum_vector, } } -template +template void DataTransformer::Transform(const vector & mat_vector, - Blob* transformed_blob) { + Blob* transformed_blob) { const int mat_num = mat_vector.size(); const int num = transformed_blob->num(); const int channels = transformed_blob->channels(); @@ -202,9 +204,9 @@ void DataTransformer::Transform(const vector & mat_vector, const int width = transformed_blob->width(); CHECK_GT(mat_num, 0) << "There is no MAT to add"; - CHECK_EQ(mat_num, num) << - "The size of mat_vector must be equals to transformed_blob->num()"; - Blob uni_blob(1, channels, height, width); + CHECK_EQ(mat_num, num) + << "The size of mat_vector must be equals to transformed_blob->num()"; + Blob < Dtype > uni_blob(1, channels, height, width); for (int item_id = 0; item_id < mat_num; ++item_id) { int offset = transformed_blob->offset(item_id); uni_blob.set_cpu_data(transformed_blob->mutable_cpu_data() + offset); @@ -212,9 +214,9 @@ void DataTransformer::Transform(const vector & mat_vector, } } -template +template void DataTransformer::Transform(const cv::Mat& cv_img, - Blob* transformed_blob) { + Blob* transformed_blob) { const int crop_size = param_.crop_size(); const int img_channels = cv_img.channels(); const int img_height = cv_img.rows; @@ -250,8 +252,9 @@ void DataTransformer::Transform(const cv::Mat& cv_img, mean = data_mean_.mutable_cpu_data(); } if (has_mean_values) { - CHECK(mean_values_.size() == 1 || mean_values_.size() == img_channels) << - "Specify either 1 mean_value or as many as channels: " << img_channels; + CHECK(mean_values_.size() == 1 || mean_values_.size() == img_channels) + << "Specify either 1 mean_value or as many as channels: " + << img_channels; if (img_channels > 1 && mean_values_.size() == 1) { // Replicate the mean_value for simplicity for (int c = 1; c < img_channels; ++c) { @@ -286,7 +289,7 @@ void DataTransformer::Transform(const cv::Mat& cv_img, Dtype* transformed_data = transformed_blob->mutable_cpu_data(); int top_index; for (int h = 0; h < height; ++h) { - const uchar* ptr = cv_cropped_img.ptr(h); + const uchar* ptr = cv_cropped_img.ptr < uchar > (h); int img_index = 0; for (int w = 0; w < width; ++w) { for (int c = 0; c < img_channels; ++c) { @@ -299,12 +302,10 @@ void DataTransformer::Transform(const cv::Mat& cv_img, Dtype pixel = 
static_cast(ptr[img_index++]); if (has_mean_file) { int mean_index = (c * img_height + h_off + h) * img_width + w_off + w; - transformed_data[top_index] = - (pixel - mean[mean_index]) * scale; + transformed_data[top_index] = (pixel - mean[mean_index]) * scale; } else { if (has_mean_values) { - transformed_data[top_index] = - (pixel - mean_values_[c]) * scale; + transformed_data[top_index] = (pixel - mean_values_[c]) * scale; } else { transformed_data[top_index] = pixel * scale; } @@ -314,9 +315,9 @@ void DataTransformer::Transform(const cv::Mat& cv_img, } } -template +template void DataTransformer::Transform(Blob* input_blob, - Blob* transformed_blob) { + Blob* transformed_blob) { const int crop_size = param_.crop_size(); const int input_num = input_blob->num(); const int input_channels = input_blob->channels(); @@ -326,11 +327,11 @@ void DataTransformer::Transform(Blob* input_blob, if (transformed_blob->count() == 0) { // Initialize transformed_blob with the right shape. if (crop_size) { - transformed_blob->Reshape(input_num, input_channels, - crop_size, crop_size); + transformed_blob->Reshape(input_num, input_channels, crop_size, + crop_size); } else { - transformed_blob->Reshape(input_num, input_channels, - input_height, input_width); + transformed_blob->Reshape(input_num, input_channels, input_height, + input_width); } } @@ -345,7 +346,6 @@ void DataTransformer::Transform(Blob* input_blob, CHECK_GE(input_height, height); CHECK_GE(input_width, width); - const Dtype scale = param_.scale(); const bool do_mirror = param_.mirror() && Rand(2); const bool has_mean_file = param_.has_mean_file(); @@ -376,14 +376,15 @@ void DataTransformer::Transform(Blob* input_blob, CHECK_EQ(input_width, data_mean_.width()); for (int n = 0; n < input_num; ++n) { int offset = input_blob->offset(n); - caffe_sub(data_mean_.count(), input_data + offset, - data_mean_.cpu_data(), input_data + offset); + caffe_sub(data_mean_.count(), input_data + offset, data_mean_.cpu_data(), + input_data + offset); } } if (has_mean_values) { - CHECK(mean_values_.size() == 1 || mean_values_.size() == input_channels) << - "Specify either 1 mean_value or as many as channels: " << input_channels; + CHECK(mean_values_.size() == 1 || mean_values_.size() == input_channels) + << "Specify either 1 mean_value or as many as channels: " + << input_channels; if (mean_values_.size() == 1) { caffe_add_scalar(input_blob->count(), -(mean_values_[0]), input_data); } else { @@ -391,7 +392,7 @@ void DataTransformer::Transform(Blob* input_blob, for (int c = 0; c < input_channels; ++c) { int offset = input_blob->offset(n, c); caffe_add_scalar(input_height * input_width, -(mean_values_[c]), - input_data + offset); + input_data + offset); } } } @@ -411,7 +412,7 @@ void DataTransformer::Transform(Blob* input_blob, if (do_mirror) { int top_index_w = top_index_h + width - 1; for (int w = 0; w < width; ++w) { - transformed_data[top_index_w-w] = input_data[data_index_h + w]; + transformed_data[top_index_w - w] = input_data[data_index_h + w]; } } else { for (int w = 0; w < width; ++w) { @@ -427,14 +428,14 @@ void DataTransformer::Transform(Blob* input_blob, } } -template +template vector DataTransformer::InferBlobShape(const Datum& datum) { if (datum.encoded()) { CHECK(!(param_.force_color() && param_.force_gray())) << "cannot set both force_color and force_gray"; cv::Mat cv_img; if (param_.force_color() || param_.force_gray()) { - // If force_color then decode in color otherwise decode in gray. + // If force_color then decode in color otherwise decode in gray. 
cv_img = DecodeDatumToCVMat(datum, param_.force_color()); } else { cv_img = DecodeDatumToCVMatNative(datum); @@ -455,12 +456,12 @@ vector DataTransformer::InferBlobShape(const Datum& datum) { vector shape(4); shape[0] = 1; shape[1] = datum_channels; - shape[2] = (crop_size)? crop_size: datum_height; - shape[3] = (crop_size)? crop_size: datum_width; + shape[2] = (crop_size) ? crop_size : datum_height; + shape[3] = (crop_size) ? crop_size : datum_width; return shape; } -template +template vector DataTransformer::InferBlobShape( const vector & datum_vector) { const int num = datum_vector.size(); @@ -472,7 +473,7 @@ vector DataTransformer::InferBlobShape( return shape; } -template +template vector DataTransformer::InferBlobShape(const cv::Mat& cv_img) { const int crop_size = param_.crop_size(); const int img_channels = cv_img.channels(); @@ -486,12 +487,12 @@ vector DataTransformer::InferBlobShape(const cv::Mat& cv_img) { vector shape(4); shape[0] = 1; shape[1] = img_channels; - shape[2] = (crop_size)? crop_size: img_height; - shape[3] = (crop_size)? crop_size: img_width; + shape[2] = (crop_size) ? crop_size : img_height; + shape[3] = (crop_size) ? crop_size : img_width; return shape; } -template +template vector DataTransformer::InferBlobShape( const vector & mat_vector) { const int num = mat_vector.size(); @@ -505,8 +506,8 @@ vector DataTransformer::InferBlobShape( template void DataTransformer::InitRand() { - const bool needs_rand = param_.mirror() || - (phase_ == TRAIN && param_.crop_size()); + const bool needs_rand = param_.mirror() + || (phase_ == TRAIN && param_.crop_size()); if (needs_rand) { const unsigned int rng_seed = caffe_rng_rand(); rng_.reset(new Caffe::RNG(rng_seed)); @@ -517,13 +518,12 @@ void DataTransformer::InitRand() { template int DataTransformer::Rand(int n) { - CHECK(rng_); + CHECK (rng_); CHECK_GT(n, 0); - caffe::rng_t* rng = - static_cast(rng_->generator()); + caffe::rng_t* rng = static_cast(rng_->generator()); return ((*rng)() % n); } -INSTANTIATE_CLASS(DataTransformer); +INSTANTIATE_CLASS (DataTransformer); } // namespace caffe diff --git a/src/caffe/device.cpp b/src/caffe/device.cpp new file mode 100644 index 00000000..fcbffe09 --- /dev/null +++ b/src/caffe/device.cpp @@ -0,0 +1,426 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + +#include "caffe/common.hpp" +#include "caffe/device.hpp" +#include +#include +#include +#include +#include + +namespace caffe { +#ifndef CPU_ONLY +string buildOption = "-x clc++ "; +std::string oclKernelPath = "./src/caffe/ocl/"; +Device amdDevice; + +Device::~Device() { + ReleaseKernels(); + free((void*) platformIDs); + free (DeviceIDs); + clReleaseProgram (Program); + clReleaseCommandQueue (CommandQueue); + clReleaseCommandQueue (CommandQueue_helper); + clReleaseContext (Context); + LOG(INFO) << "device destructor"; +} + +cl_int Device::Init(int deviceId) { + + DisplayPlatformInfo(); + + clGetPlatformIDs(0, NULL, &numPlatforms); + cl_platform_id PlatformIDs[numPlatforms]; + clGetPlatformIDs(numPlatforms, PlatformIDs, NULL); + + size_t nameLen; + cl_int res = clGetPlatformInfo(PlatformIDs[0], CL_PLATFORM_NAME, 64, + platformName, &nameLen); + if (res != CL_SUCCESS) { + fprintf(stderr, "Err: Failed to Get Platform Info\n"); + return 0; + } + platformName[nameLen] = 0; + + GetDeviceInfo(); + cl_uint uiNumDevices; + cl_bool unified_memory = false; + clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); + uiNumDevices = numDevices; + if (0 == uiNumDevices) { + LOG(FATAL) << "Err: No GPU devices"; + } else { + pDevices = (cl_device_id *) malloc(uiNumDevices * sizeof(cl_device_id)); + OCL_CHECK( + clGetDeviceIDs(PlatformIDs[0], CL_DEVICE_TYPE_GPU, uiNumDevices, + pDevices, &uiNumDevices)); + if (deviceId == -1) { + int i; + for (i = 0; i < (int) uiNumDevices; i++) { + clGetDeviceInfo(pDevices[i], CL_DEVICE_HOST_UNIFIED_MEMORY, + sizeof(cl_bool), &unified_memory, NULL); + if (!unified_memory) { //skip iGPU + //we pick the first dGPU we found + pDevices[0] = pDevices[i]; + device_id = i; + LOG(INFO) << "Picked default device type : dGPU " << device_id; + break; + } + } + if (i == uiNumDevices) { + LOG(FATAL) << "Cannot find any dGPU! "; + } + } else if (deviceId >= 0 && deviceId < uiNumDevices) { + pDevices[0] = pDevices[deviceId]; + device_id = deviceId; + LOG(INFO) << "Picked device type : GPU " << device_id; + } else { + LOG(FATAL) << " Invalid GPU deviceId! 
"; + } + } + + Context = clCreateContext(NULL, 1, pDevices, NULL, NULL, NULL); + if (NULL == Context) { + fprintf(stderr, "Err: Failed to Create Context\n"); + return 0; + } + CommandQueue = clCreateCommandQueue(Context, pDevices[0], + CL_QUEUE_PROFILING_ENABLE, NULL); + CommandQueue_helper = clCreateCommandQueue(Context, pDevices[0], + CL_QUEUE_PROFILING_ENABLE, NULL); + if (NULL == CommandQueue || NULL == CommandQueue_helper) { + fprintf(stderr, "Err: Failed to Create Commandqueue\n"); + return 0; + } + BuildProgram (oclKernelPath); + row = clblasRowMajor; + col = clblasColumnMajor; + return 0; +} + +void Device::BuildProgram(std::string kernel_dir) { + std::string strSource = ""; + DIR *ocl_dir; + struct dirent *dirp; + if ((ocl_dir = opendir(kernel_dir.c_str())) == NULL) { + fprintf(stderr, "Err: Open ocl dir failed!\n"); + } + while ((dirp = readdir(ocl_dir)) != NULL) { + //Ignore hidden files + if (dirp->d_name[0] == '.') + continue; + std::string file_name = std::string(dirp->d_name); + //Skip non *.cl files + size_t last_dot_pos = file_name.find_last_of("."); + if (file_name.substr(last_dot_pos + 1) != "cl") + continue; + + std::string ocl_kernel_full_path = kernel_dir + file_name; + std::string tmpSource = ""; + ConvertToString(ocl_kernel_full_path.c_str(), tmpSource); + strSource += tmpSource; + } + const char *pSource; + pSource = strSource.c_str(); + size_t uiArrSourceSize[] = { 0 }; + uiArrSourceSize[0] = strlen(pSource); + Program = NULL; + Program = clCreateProgramWithSource(Context, 1, &pSource, uiArrSourceSize, + NULL); + if (NULL == Program) { + fprintf(stderr, "Err: Failed to create program\n"); + } + cl_int iStatus = clBuildProgram(Program, 1, pDevices, buildOption.c_str(), + NULL, NULL); + LOG(INFO) << "Build Program"; + if (CL_SUCCESS != iStatus) { + fprintf(stderr, "Err: Failed to build program\n"); + char szBuildLog[16384]; + clGetProgramBuildInfo(Program, *pDevices, CL_PROGRAM_BUILD_LOG, + sizeof(szBuildLog), szBuildLog, NULL); + std::cout << szBuildLog; + clReleaseProgram (Program); + } +} + +//Use to read OpenCL source code +cl_int Device::ConvertToString(std::string pFileName, std::string &Str) { + size_t uiSize = 0; + size_t uiFileSize = 0; + char *pStr = NULL; + char *tmp = (char*) pFileName.data(); + std::fstream fFile(tmp, (std::fstream::in | std::fstream::binary)); + if (fFile.is_open()) { + fFile.seekg(0, std::fstream::end); + uiSize = uiFileSize = (size_t) fFile.tellg(); + fFile.seekg(0, std::fstream::beg); + pStr = new char[uiSize + 1]; + + if (NULL == pStr) { + fFile.close(); + return 0; + } + fFile.read(pStr, uiFileSize); + fFile.close(); + pStr[uiSize] = '\0'; + Str = pStr; + delete[] pStr; + return 0; + } + LOG(ERROR) << "Err: Failed to open cl file!"; + return -1; +} + +cl_kernel Device::GetKernel(std::string kernel_name) { + std::map::iterator it = Kernels.find(kernel_name); + if (it == Kernels.end()) { + cl_int _err = 0; + cl_kernel kernel = clCreateKernel(Program, kernel_name.c_str(), &_err); + OCL_CHECK(_err); + Kernels[kernel_name] = kernel; + } + return Kernels[kernel_name]; +} + +void Device::ReleaseKernels() { + std::map::iterator it; + for (it = Kernels.begin(); it != Kernels.end(); it++) { + clReleaseKernel(it->second); + } +} + +void Device::DisplayPlatformInfo() { + cl_int err; + + err = clGetPlatformIDs(0, NULL, &numPlatforms); + if (err != CL_SUCCESS || numPlatforms <= 0) { + LOG(ERROR) << "Failed to find any OpenCL platform."; + return; + } + + platformIDs = (cl_platform_id *) malloc( + sizeof(cl_platform_id) * numPlatforms); + err = 
clGetPlatformIDs(numPlatforms, platformIDs, NULL); + if (err != CL_SUCCESS) { + LOG(ERROR) << "Failed to find any OpenCL platform."; + return; + } + + LOG(INFO) << "Number of platforms found:" << numPlatforms; + + //iterate through the list of platforms displaying platform information + for (cl_uint i = 0; i < numPlatforms; i++) { + DisplayInfo(platformIDs[i], CL_PLATFORM_NAME, "CL_PLATFORM_NAME"); + DisplayInfo(platformIDs[i], CL_PLATFORM_PROFILE, "CL_PLATFORM_PROFILE"); + DisplayInfo(platformIDs[i], CL_PLATFORM_VERSION, "CL_PLATFORM_VERSION"); + DisplayInfo(platformIDs[i], CL_PLATFORM_VENDOR, "CL_PLATFORM_VENDOR"); + DisplayInfo(platformIDs[i], CL_PLATFORM_EXTENSIONS, + "CL_PLATFORM_EXTENSIONS"); + } + +} + +void Device::DisplayInfo(cl_platform_id id, cl_platform_info name, + std::string str) { + cl_int err; + std::size_t paramValueSize; + + err = clGetPlatformInfo(id, name, 0, NULL, ¶mValueSize); + if (err != CL_SUCCESS) { + LOG(ERROR) << "Failed to find OpenCL platform:" << str; + return; + } + + char * info = (char *) alloca(sizeof(char) * paramValueSize); + err = clGetPlatformInfo(id, name, paramValueSize, info, NULL); + if (err != CL_SUCCESS) { + LOG(ERROR) << "Failed to find OpenCL platform:" << str; + return; + } + + LOG(INFO) << "\t" << str << "\t" << info; +} + +void Device::GetDeviceInfo() { + cl_int err; + //by default, we select the first platform. can be extended for more platforms + //query GPU device for now + err = clGetDeviceIDs(platformIDs[0], CL_DEVICE_TYPE_GPU, 0, NULL, + &numDevices); + // we allow program run if no GPU is found. Just return. No error reported. + if (numDevices < 1) { + LOG(INFO) << "No GPU Devices found for platform" << platformIDs[0]; + LOG(WARNING) << "No GPU Devices found for platform" << platformIDs[0]; + return; + } + + DeviceIDs = (cl_device_id *) malloc(sizeof(cl_device_id) * numDevices); + err = clGetDeviceIDs(platformIDs[0], CL_DEVICE_TYPE_GPU, numDevices, + DeviceIDs, NULL); + if (err != CL_SUCCESS) { + LOG(INFO) << "Failed to find any GPU devices."; + return; + } + + LOG(INFO) << "Number of devices found:" << numDevices; + for (cl_uint i = 0; i < numDevices; i++) { + LOG(INFO) << "\t" << "DeviceID" << ":\t" << DeviceIDs[i]; + DisplayDeviceInfo < cl_device_type + > (DeviceIDs[i], CL_DEVICE_TYPE, "Device Type"); + DisplayDeviceInfo < cl_bool + > (DeviceIDs[i], CL_DEVICE_HOST_UNIFIED_MEMORY, "Is it integrated GPU?"); + DisplayDeviceInfo < cl_uint + > (DeviceIDs[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, "Max clock frequency MHz"); + DisplayDeviceInfo < cl_bool + > (DeviceIDs[i], CL_DEVICE_HOST_UNIFIED_MEMORY, "Host-Device unified mem"); + DisplayDeviceInfo < cl_bool + > (DeviceIDs[i], CL_DEVICE_ERROR_CORRECTION_SUPPORT, "ECC support"); + DisplayDeviceInfo < cl_bool + > (DeviceIDs[i], CL_DEVICE_ENDIAN_LITTLE, "Endian little"); + DisplayDeviceInfo < cl_uint + > (DeviceIDs[i], CL_DEVICE_MAX_COMPUTE_UNITS, "Max compute units"); + DisplayDeviceInfo < size_t + > (DeviceIDs[i], CL_DEVICE_MAX_WORK_GROUP_SIZE, "Max work group size"); + DisplayDeviceInfo < cl_uint + > (DeviceIDs[i], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, "Max work item dimensions"); + DisplayDeviceInfo(DeviceIDs[i], CL_DEVICE_MAX_WORK_ITEM_SIZES, + "Max work item sizes"); + DisplayDeviceInfo < cl_command_queue_properties + > (DeviceIDs[i], CL_DEVICE_QUEUE_PROPERTIES, "CL_DEVICE_QUEUE_PROPERTIES"); + DisplayDeviceInfo < cl_device_exec_capabilities + > (DeviceIDs[i], CL_DEVICE_EXECUTION_CAPABILITIES, "CL_DEVICE_EXECUTION_CAPABILITIES"); + DisplayDeviceInfo < cl_ulong + > (DeviceIDs[i], 
CL_DEVICE_MAX_MEM_ALLOC_SIZE, "Max mem alloc size"); + DisplayDeviceInfo < cl_ulong + > (DeviceIDs[i], CL_DEVICE_GLOBAL_MEM_SIZE, "Global mem size"); + DisplayDeviceInfo < cl_ulong + > (DeviceIDs[i], CL_DEVICE_LOCAL_MEM_SIZE, "Local mem size"); + } + +} + +void Device::DeviceQuery() { + DisplayPlatformInfo(); + + clGetPlatformIDs(0, NULL, &numPlatforms); + cl_platform_id PlatformIDs[numPlatforms]; + clGetPlatformIDs(numPlatforms, PlatformIDs, NULL); + + size_t nameLen; + cl_int res = clGetPlatformInfo(PlatformIDs[0], CL_PLATFORM_NAME, 64, + platformName, &nameLen); + if (res != CL_SUCCESS) { + fprintf(stderr, "Err: Failed to Get Platform Info\n"); + return; + } + platformName[nameLen] = 0; + + GetDeviceInfo(); +} + +template +void Device::DisplayDeviceInfo(cl_device_id id, cl_device_info name, + std::string str) { + cl_int err; + std::size_t paramValueSize; + + err = clGetDeviceInfo(id, name, 0, NULL, ¶mValueSize); + if (err != CL_SUCCESS) { + LOG(ERROR) << "Failed to find OpenCL device info:" << str; + return; + } + + std::string content; + T * info = (T *) alloca(sizeof(T) * paramValueSize); + err = clGetDeviceInfo(id, name, paramValueSize, info, NULL); + if (err != CL_SUCCESS) { + LOG(ERROR) << "Failed to find OpenCL device info:" << str; + return; + } + + switch (name) { + case CL_DEVICE_TYPE: { + std::string deviceType; + appendBitfield < cl_device_type + > (*(reinterpret_cast(info)), CL_DEVICE_TYPE_CPU, "CL_DEVICE_TYPE_CPU", deviceType); + + appendBitfield < cl_device_type + > (*(reinterpret_cast(info)), CL_DEVICE_TYPE_GPU, "CL_DEVICE_TYPE_GPU", deviceType); + + appendBitfield < cl_device_type + > (*(reinterpret_cast(info)), CL_DEVICE_TYPE_ACCELERATOR, "CL_DEVICE_TYPE_ACCELERATOR", deviceType); + + appendBitfield < cl_device_type + > (*(reinterpret_cast(info)), CL_DEVICE_TYPE_DEFAULT, "CL_DEVICE_TYPE_DEFAULT", deviceType); + + LOG(INFO) << "\t " << str << ":\t" << deviceType; + } + break; + case CL_DEVICE_EXECUTION_CAPABILITIES: { + std::string memType; + appendBitfield < cl_device_exec_capabilities + > (*(reinterpret_cast(info)), CL_EXEC_KERNEL, "CL_EXEC_KERNEL", memType); + + appendBitfield < cl_device_exec_capabilities + > (*(reinterpret_cast(info)), CL_EXEC_NATIVE_KERNEL, "CL_EXEC_NATIVE_KERNEL", memType); + + LOG(INFO) << "\t " << str << ":\t" << memType; + + } + break; + case CL_DEVICE_QUEUE_PROPERTIES: { + std::string memType; + appendBitfield < cl_device_exec_capabilities + > (*(reinterpret_cast(info)), CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, "CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE", memType); + + appendBitfield < cl_device_exec_capabilities + > (*(reinterpret_cast(info)), CL_QUEUE_PROFILING_ENABLE, "CL_QUEUE_PROFILING_ENABLE", memType); + + LOG(INFO) << "\t " << str << ":\t" << memType; + } + break; + default: + LOG(INFO) << "\t" << str << ":\t" << *info; + break; + } + +} + +template +void Device::appendBitfield(T info, T value, std::string name, + std::string &str) { + if (info & value) { + if (str.length() > 0) { + str.append(" | "); + } + str.append(name); + } +} + +#endif +} // namespace caffe + diff --git a/src/caffe/internal_thread.cpp b/src/caffe/internal_thread.cpp index c2d19d43..fb512847 100644 --- a/src/caffe/internal_thread.cpp +++ b/src/caffe/internal_thread.cpp @@ -11,7 +11,6 @@ bool InternalThread::is_started() const { return thread_.get() != NULL && thread_->joinable(); } - bool InternalThread::StartInternalThread() { if (!WaitForInternalThreadToExit()) { return false; diff --git a/src/caffe/layer_factory.cpp b/src/caffe/layer_factory.cpp index 
926c7d8f..44233c98 100644 --- a/src/caffe/layer_factory.cpp +++ b/src/caffe/layer_factory.cpp @@ -18,8 +18,7 @@ namespace caffe { // Get convolution layer according to engine. template -shared_ptr > GetConvolutionLayer( - const LayerParameter& param) { +shared_ptr > GetConvolutionLayer(const LayerParameter& param) { ConvolutionParameter_Engine engine = param.convolution_param().engine(); if (engine == ConvolutionParameter_Engine_DEFAULT) { engine = ConvolutionParameter_Engine_CAFFE; @@ -28,7 +27,7 @@ shared_ptr > GetConvolutionLayer( #endif } if (engine == ConvolutionParameter_Engine_CAFFE) { - return shared_ptr >(new ConvolutionLayer(param)); + return shared_ptr < Layer > (new ConvolutionLayer(param)); #ifdef USE_CUDNN } else if (engine == ConvolutionParameter_Engine_CUDNN) { return shared_ptr >(new CuDNNConvolutionLayer(param)); @@ -51,14 +50,14 @@ shared_ptr > GetPoolingLayer(const LayerParameter& param) { #endif } if (engine == PoolingParameter_Engine_CAFFE) { - return shared_ptr >(new PoolingLayer(param)); + return shared_ptr < Layer > (new PoolingLayer(param)); #ifdef USE_CUDNN } else if (engine == PoolingParameter_Engine_CUDNN) { PoolingParameter p_param = param.pooling_param(); if (p_param.pad() || p_param.pad_h() || p_param.pad_w() || param.top_size() > 1) { LOG(INFO) << "CUDNN does not support padding or multiple tops. " - << "Using Caffe's own pooling layer."; + << "Using Caffe's own pooling layer."; return shared_ptr >(new PoolingLayer(param)); } return shared_ptr >(new CuDNNPoolingLayer(param)); @@ -81,7 +80,7 @@ shared_ptr > GetReLULayer(const LayerParameter& param) { #endif } if (engine == ReLUParameter_Engine_CAFFE) { - return shared_ptr >(new ReLULayer(param)); + return shared_ptr < Layer > (new ReLULayer(param)); #ifdef USE_CUDNN } else if (engine == ReLUParameter_Engine_CUDNN) { return shared_ptr >(new CuDNNReLULayer(param)); @@ -104,7 +103,7 @@ shared_ptr > GetSigmoidLayer(const LayerParameter& param) { #endif } if (engine == SigmoidParameter_Engine_CAFFE) { - return shared_ptr >(new SigmoidLayer(param)); + return shared_ptr < Layer > (new SigmoidLayer(param)); #ifdef USE_CUDNN } else if (engine == SigmoidParameter_Engine_CUDNN) { return shared_ptr >(new CuDNNSigmoidLayer(param)); @@ -127,7 +126,7 @@ shared_ptr > GetSoftmaxLayer(const LayerParameter& param) { #endif } if (engine == SoftmaxParameter_Engine_CAFFE) { - return shared_ptr >(new SoftmaxLayer(param)); + return shared_ptr < Layer > (new SoftmaxLayer(param)); #ifdef USE_CUDNN } else if (engine == SoftmaxParameter_Engine_CUDNN) { return shared_ptr >(new CuDNNSoftmaxLayer(param)); @@ -150,7 +149,7 @@ shared_ptr > GetTanHLayer(const LayerParameter& param) { #endif } if (engine == TanHParameter_Engine_CAFFE) { - return shared_ptr >(new TanHLayer(param)); + return shared_ptr < Layer > (new TanHLayer(param)); #ifdef USE_CUDNN } else if (engine == TanHParameter_Engine_CUDNN) { return shared_ptr >(new CuDNNTanHLayer(param)); @@ -181,4 +180,5 @@ REGISTER_LAYER_CREATOR(Python, GetPythonLayer); // Layers that use their constructor as their default creator should be // registered in their corresponding cpp files. Do not register them here. 
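// Usage sketch for the factory above (illustrative only; it assumes the stock
// LayerRegistry / LayerParameter API that this file already uses, and the
// parameter values are made up):
//
//   LayerParameter param;
//   param.set_type("Convolution");
//   param.mutable_convolution_param()->set_num_output(96);
//   // With no engine set, GetConvolutionLayer() falls back to
//   // ConvolutionParameter_Engine_CAFFE, i.e. the OpenCL-backed
//   // ConvolutionLayer in this port.
//   shared_ptr<Layer<float> > layer =
//       LayerRegistry<float>::CreateLayer(param);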
-} // namespace caffe +} + // namespace caffe diff --git a/src/caffe/layers/absval_layer.cpp b/src/caffe/layers/absval_layer.cpp index 5ce28c9e..20898f15 100644 --- a/src/caffe/layers/absval_layer.cpp +++ b/src/caffe/layers/absval_layer.cpp @@ -8,15 +8,15 @@ namespace caffe { template void AbsValLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - NeuronLayer::LayerSetUp(bottom, top); + const vector*>& top) { + NeuronLayer < Dtype > ::LayerSetUp(bottom, top); CHECK_NE(top[0], bottom[0]) << this->type() << " Layer does not " - "allow in-place computation."; + "allow in-place computation."; } template -void AbsValLayer::Forward_cpu( - const vector*>& bottom, const vector*>& top) { +void AbsValLayer::Forward_cpu(const vector*>& bottom, + const vector*>& top) { const int count = top[0]->count(); Dtype* top_data = top[0]->mutable_cpu_data(); caffe_abs(count, bottom[0]->cpu_data(), top_data); @@ -35,11 +35,35 @@ void AbsValLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY +#ifndef CPU_ONLY +// begin: code modified for OpenCL port +template +void AbsValLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + const int count = top[0]->count(); + Dtype* top_data = top[0]->mutable_gpu_data(); + caffe_gpu_abs(count, bottom[0]->gpu_data(), top_data); +} + +template +void AbsValLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + const int count = top[0]->count(); + const Dtype* top_diff = top[0]->gpu_diff(); + if (propagate_down[0]) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + caffe_gpu_sign(count, bottom_data, bottom_diff); + caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff); + } +} +// end: code modified for OpenCL port + +#else STUB_GPU(AbsValLayer); #endif -INSTANTIATE_CLASS(AbsValLayer); -REGISTER_LAYER_CLASS(AbsVal); +INSTANTIATE_CLASS (AbsValLayer); +REGISTER_LAYER_CLASS (AbsVal); } // namespace caffe diff --git a/src/caffe/layers/absval_layer.cu b/src/caffe/layers/absval_layer.cu deleted file mode 100644 index bb310e1a..00000000 --- a/src/caffe/layers/absval_layer.cu +++ /dev/null @@ -1,33 +0,0 @@ -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void AbsValLayer::Forward_gpu( - const vector*>& bottom, const vector*>& top) { - const int count = top[0]->count(); - Dtype* top_data = top[0]->mutable_gpu_data(); - caffe_gpu_abs(count, bottom[0]->gpu_data(), top_data); -} - -template -void AbsValLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const int count = top[0]->count(); - const Dtype* top_diff = top[0]->gpu_diff(); - if (propagate_down[0]) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - caffe_gpu_sign(count, bottom_data, bottom_diff); - caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff); - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(AbsValLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/accuracy_layer.cpp b/src/caffe/layers/accuracy_layer.cpp index 90aad675..4cfc96f8 100644 --- a/src/caffe/layers/accuracy_layer.cpp +++ b/src/caffe/layers/accuracy_layer.cpp @@ -11,24 +11,23 @@ namespace caffe { template -void AccuracyLayer::LayerSetUp( - const vector*>& bottom, const vector*>& top) { +void AccuracyLayer::LayerSetUp(const vector*>& bottom, + const vector*>& top) { top_k_ = 
this->layer_param_.accuracy_param().top_k(); - has_ignore_label_ = - this->layer_param_.accuracy_param().has_ignore_label(); + has_ignore_label_ = this->layer_param_.accuracy_param().has_ignore_label(); if (has_ignore_label_) { ignore_label_ = this->layer_param_.accuracy_param().ignore_label(); } } template -void AccuracyLayer::Reshape( - const vector*>& bottom, const vector*>& top) { +void AccuracyLayer::Reshape(const vector*>& bottom, + const vector*>& top) { CHECK_LE(top_k_, bottom[0]->count() / bottom[1]->count()) << "top_k must be less than or equal to the number of classes."; - label_axis_ = - bottom[0]->CanonicalAxisIndex(this->layer_param_.accuracy_param().axis()); + label_axis_ = bottom[0]->CanonicalAxisIndex( + this->layer_param_.accuracy_param().axis()); outer_num_ = bottom[0]->count(0, label_axis_); inner_num_ = bottom[0]->count(label_axis_ + 1); CHECK_EQ(outer_num_ * inner_num_, bottom[1]->count()) @@ -48,27 +47,26 @@ void AccuracyLayer::Forward_cpu(const vector*>& bottom, const Dtype* bottom_label = bottom[1]->cpu_data(); const int dim = bottom[0]->count() / outer_num_; const int num_labels = bottom[0]->shape(label_axis_); - vector maxval(top_k_+1); - vector max_id(top_k_+1); + vector < Dtype > maxval(top_k_ + 1); + vector max_id(top_k_ + 1); int count = 0; for (int i = 0; i < outer_num_; ++i) { for (int j = 0; j < inner_num_; ++j) { - const int label_value = - static_cast(bottom_label[i * inner_num_ + j]); + const int label_value = static_cast(bottom_label[i * inner_num_ + j]); if (has_ignore_label_ && label_value == ignore_label_) { continue; } DCHECK_GE(label_value, 0); DCHECK_LT(label_value, num_labels); // Top-k accuracy - std::vector > bottom_data_vector; + std::vector < std::pair > bottom_data_vector; for (int k = 0; k < num_labels; ++k) { - bottom_data_vector.push_back(std::make_pair( - bottom_data[i * dim + k * inner_num_ + j], k)); + bottom_data_vector.push_back( + std::make_pair(bottom_data[i * dim + k * inner_num_ + j], k)); } - std::partial_sort( - bottom_data_vector.begin(), bottom_data_vector.begin() + top_k_, - bottom_data_vector.end(), std::greater >()); + std::partial_sort(bottom_data_vector.begin(), + bottom_data_vector.begin() + top_k_, bottom_data_vector.end(), + std::greater >()); // check if true label is in top k predictions for (int k = 0; k < top_k_; k++) { if (bottom_data_vector[k].second == label_value) { @@ -85,7 +83,7 @@ void AccuracyLayer::Forward_cpu(const vector*>& bottom, // Accuracy layer should not be used as a loss function. 
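// Worked example of the top-k test above, using made-up scores for a single
// spatial position (not data from this patch): with class scores
// {0.1, 0.5, 0.3, 0.1}, top_k_ == 2 and ground-truth label 2, the
// partial_sort above brings (0.5, 1) and (0.3, 2) to the front, so the
// sample counts as correct; with top_k_ == 1 it would not.
//
//   std::vector<std::pair<float, int> > v;  // (score, class index)
//   float scores[4] = {0.1f, 0.5f, 0.3f, 0.1f};
//   for (int k = 0; k < 4; ++k) v.push_back(std::make_pair(scores[k], k));
//   std::partial_sort(v.begin(), v.begin() + 2, v.end(),
//                     std::greater<std::pair<float, int> >());
//   // v[0].second == 1 and v[1].second == 2, so label 2 is in the top 2.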
} -INSTANTIATE_CLASS(AccuracyLayer); -REGISTER_LAYER_CLASS(Accuracy); +INSTANTIATE_CLASS (AccuracyLayer); +REGISTER_LAYER_CLASS (Accuracy); } // namespace caffe diff --git a/src/caffe/layers/argmax_layer.cpp b/src/caffe/layers/argmax_layer.cpp index c4040cdc..7b37283d 100644 --- a/src/caffe/layers/argmax_layer.cpp +++ b/src/caffe/layers/argmax_layer.cpp @@ -10,7 +10,7 @@ namespace caffe { template void ArgMaxLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { out_max_val_ = this->layer_param_.argmax_param().out_max_val(); top_k_ = this->layer_param_.argmax_param().top_k(); CHECK_GE(top_k_, 1) << " top k must not be less than 1."; @@ -20,7 +20,7 @@ void ArgMaxLayer::LayerSetUp(const vector*>& bottom, template void ArgMaxLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { if (out_max_val_) { // Produces max_ind and max_val top[0]->Reshape(bottom[0]->num(), 2, top_k_, 1); @@ -38,14 +38,13 @@ void ArgMaxLayer::Forward_cpu(const vector*>& bottom, int num = bottom[0]->num(); int dim = bottom[0]->count() / bottom[0]->num(); for (int i = 0; i < num; ++i) { - std::vector > bottom_data_vector; + std::vector < std::pair > bottom_data_vector; for (int j = 0; j < dim; ++j) { - bottom_data_vector.push_back( - std::make_pair(bottom_data[i * dim + j], j)); + bottom_data_vector.push_back(std::make_pair(bottom_data[i * dim + j], j)); } - std::partial_sort( - bottom_data_vector.begin(), bottom_data_vector.begin() + top_k_, - bottom_data_vector.end(), std::greater >()); + std::partial_sort(bottom_data_vector.begin(), + bottom_data_vector.begin() + top_k_, bottom_data_vector.end(), + std::greater >()); for (int j = 0; j < top_k_; ++j) { top_data[top[0]->offset(i, 0, j)] = bottom_data_vector[j].second; } @@ -57,7 +56,7 @@ void ArgMaxLayer::Forward_cpu(const vector*>& bottom, } } -INSTANTIATE_CLASS(ArgMaxLayer); -REGISTER_LAYER_CLASS(ArgMax); +INSTANTIATE_CLASS (ArgMaxLayer); +REGISTER_LAYER_CLASS (ArgMax); } // namespace caffe diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp index ccb3adc7..5d99e04d 100644 --- a/src/caffe/layers/base_conv_layer.cpp +++ b/src/caffe/layers/base_conv_layer.cpp @@ -5,29 +5,77 @@ #include "caffe/util/im2col.hpp" #include "caffe/util/math_functions.hpp" #include "caffe/vision_layers.hpp" +#include "caffe/common.hpp" namespace caffe { +#ifndef CPU_ONLY +#ifdef use_packing_scheme +template size_t BaseConvolutionLayer::subtop_mem_size = sizeof(Dtype); +template size_t BaseConvolutionLayer::trans_mem_size = sizeof(Dtype); +template cl_mem BaseConvolutionLayer::subTopMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, BaseConvolutionLayer::subtop_mem_size, NULL, NULL); +template cl_mem BaseConvolutionLayer::transMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, BaseConvolutionLayer::trans_mem_size, NULL, NULL); +#endif + +template +void Alloc_public_tmp_mem(size_t subtop_size, size_t trans_size) { + if (subtop_size > BaseConvolutionLayer < Dtype > ::subtop_mem_size) { + ConvolutionLayer < Dtype > ::subtop_mem_size = subtop_size; + clReleaseMemObject(ConvolutionLayer < Dtype > ::subTopMem); + ConvolutionLayer < Dtype > ::subTopMem = clCreateBuffer(amdDevice.Context, + CL_MEM_READ_WRITE, BaseConvolutionLayer < Dtype > ::subtop_mem_size, + NULL, NULL); + } + if (trans_size > ConvolutionLayer < Dtype > ::trans_mem_size) { + ConvolutionLayer < Dtype > ::trans_mem_size = trans_size; + clReleaseMemObject(ConvolutionLayer < Dtype > ::transMem); + ConvolutionLayer < 
Dtype > ::transMem = clCreateBuffer(amdDevice.Context, + CL_MEM_READ_WRITE, BaseConvolutionLayer < Dtype > ::trans_mem_size, + NULL, NULL); + } +} + +template +void BaseConvolutionLayer::ocl_setup() { + M_ = num_output_ / group_; + K_ = conv_in_channels_ * kernel_w_ * kernel_h_ / group_; + N_ = height_out_ * width_out_; +#ifdef use_packing_scheme + size_t subtop_size = (size_t)((M_ * group_) * N_ * global_packing_N * sizeof(Dtype)); + size_t trans_size = (size_t)((K_ * group_ )* N_ * global_packing_N * sizeof(Dtype)); + Alloc_public_tmp_mem(subtop_size, trans_size); +#endif +} + +#endif + +template +BaseConvolutionLayer::~BaseConvolutionLayer() { +} + template void BaseConvolutionLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " << "corresponding to (num, channels, height, width)"; // Configure the kernel size, padding, stride, and inputs. ConvolutionParameter conv_param = this->layer_param_.convolution_param(); - CHECK(!conv_param.has_kernel_size() != - !(conv_param.has_kernel_h() && conv_param.has_kernel_w())) + CHECK( + !conv_param.has_kernel_size() + != !(conv_param.has_kernel_h() && conv_param.has_kernel_w())) << "Filter size is kernel_size OR kernel_h and kernel_w; not both"; - CHECK(conv_param.has_kernel_size() || - (conv_param.has_kernel_h() && conv_param.has_kernel_w())) + CHECK( + conv_param.has_kernel_size() + || (conv_param.has_kernel_h() && conv_param.has_kernel_w())) << "For non-square filters both kernel_h and kernel_w are required."; - CHECK((!conv_param.has_pad() && conv_param.has_pad_h() - && conv_param.has_pad_w()) - || (!conv_param.has_pad_h() && !conv_param.has_pad_w())) + CHECK( + (!conv_param.has_pad() && conv_param.has_pad_h() && conv_param.has_pad_w()) + || (!conv_param.has_pad_h() && !conv_param.has_pad_w())) << "pad is pad OR pad_h and pad_w are required."; - CHECK((!conv_param.has_stride() && conv_param.has_stride_h() - && conv_param.has_stride_w()) - || (!conv_param.has_stride_h() && !conv_param.has_stride_w())) + CHECK( + (!conv_param.has_stride() && conv_param.has_stride_h() + && conv_param.has_stride_w()) + || (!conv_param.has_stride_h() && !conv_param.has_stride_w())) << "Stride is stride OR stride_h and stride_w are required."; if (conv_param.has_kernel_size()) { kernel_h_ = kernel_w_ = conv_param.kernel_size(); @@ -51,8 +99,8 @@ void BaseConvolutionLayer::LayerSetUp(const vector*>& bottom, } // Special case: im2col is the identity for 1x1 convolution with stride 1 // and no padding, so flag for skipping the buffer and transformation. - is_1x1_ = kernel_w_ == 1 && kernel_h_ == 1 - && stride_h_ == 1 && stride_w_ == 1 && pad_h_ == 0 && pad_w_ == 0; + is_1x1_ = kernel_w_ == 1 && kernel_h_ == 1 && stride_h_ == 1 && stride_w_ == 1 + && pad_h_ == 0 && pad_w_ == 0; // Configure output channels and groups. channels_ = bottom[0]->channels(); num_output_ = this->layer_param_.convolution_param().num_output(); @@ -68,6 +116,7 @@ void BaseConvolutionLayer::LayerSetUp(const vector*>& bottom, conv_out_channels_ = num_output_; conv_in_channels_ = channels_; } + // Handle the parameters: weights and biases. 
// - blobs_[0] holds the filter weights // - blobs_[1] holds the biases (optional) @@ -82,17 +131,22 @@ void BaseConvolutionLayer::LayerSetUp(const vector*>& bottom, } // Initialize and fill the weights: // output channels x input channels per-group x kernel height x kernel width - this->blobs_[0].reset(new Blob( - conv_out_channels_, conv_in_channels_ / group_, kernel_h_, kernel_w_)); - shared_ptr > weight_filler(GetFiller( - this->layer_param_.convolution_param().weight_filler())); + this->blobs_[0].reset( + new Blob(conv_out_channels_, conv_in_channels_ / group_, + kernel_h_, kernel_w_)); + shared_ptr < Filler + > weight_filler( + GetFiller < Dtype + > (this->layer_param_.convolution_param().weight_filler())); weight_filler->Fill(this->blobs_[0].get()); // If necessary, initialize and fill the biases. if (bias_term_) { vector bias_shape(1, num_output_); this->blobs_[1].reset(new Blob(bias_shape)); - shared_ptr > bias_filler(GetFiller( - this->layer_param_.convolution_param().bias_filler())); + shared_ptr < Filler + > bias_filler( + GetFiller < Dtype + > (this->layer_param_.convolution_param().bias_filler())); bias_filler->Fill(this->blobs_[1].get()); } } @@ -102,14 +156,14 @@ void BaseConvolutionLayer::LayerSetUp(const vector*>& bottom, template void BaseConvolutionLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " << "corresponding to (num, channels, height, width)"; num_ = bottom[0]->num(); height_ = bottom[0]->height(); width_ = bottom[0]->width(); CHECK_EQ(bottom[0]->channels(), channels_) << "Input size incompatible with" - " convolution kernel."; + " convolution kernel."; // TODO: generalize to handle inputs of different shapes. for (int bottom_id = 1; bottom_id < bottom.size(); ++bottom_id) { CHECK_EQ(num_, bottom[bottom_id]->num()) << "Inputs must have same num."; @@ -153,6 +207,10 @@ void BaseConvolutionLayer::Reshape(const vector*>& bottom, caffe_set(bias_multiplier_.count(), Dtype(1), bias_multiplier_.mutable_cpu_data()); } +#ifndef CPU_ONLY + //initializa OpenCL kernels and cl_mem objects + ocl_setup(); +#endif } template @@ -166,19 +224,17 @@ void BaseConvolutionLayer::forward_cpu_gemm(const Dtype* input, col_buff = col_buffer_.cpu_data(); } for (int g = 0; g < group_; ++g) { - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, conv_out_channels_ / - group_, conv_out_spatial_dim_, kernel_dim_ / group_, - (Dtype)1., weights + weight_offset_ * g, col_buff + col_offset_ * g, - (Dtype)0., output + output_offset_ * g); + caffe_cpu_gemm (CblasNoTrans, CblasNoTrans, conv_out_channels_ / group_, conv_out_spatial_dim_, kernel_dim_ + / group_, (Dtype) 1., weights + weight_offset_ * g, col_buff + + col_offset_ * g, (Dtype) 0., output + output_offset_ * g); } } template void BaseConvolutionLayer::forward_cpu_bias(Dtype* output, const Dtype* bias) { - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, num_output_, - height_out_ * width_out_, 1, (Dtype)1., bias, bias_multiplier_.cpu_data(), - (Dtype)1., output); + caffe_cpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, num_output_, height_out_ * width_out_, 1, (Dtype) 1., bias, bias_multiplier_.cpu_data(), (Dtype) 1., output); } template @@ -189,10 +245,10 @@ void BaseConvolutionLayer::backward_cpu_gemm(const Dtype* output, col_buff = input; } for (int g = 0; g < group_; ++g) { - caffe_cpu_gemm(CblasTrans, CblasNoTrans, kernel_dim_ / group_, - conv_out_spatial_dim_, conv_out_channels_ / group_, - (Dtype)1., weights + weight_offset_ * g, output + 
output_offset_ * g, - (Dtype)0., col_buff + col_offset_ * g); + caffe_cpu_gemm < Dtype + > (CblasTrans, CblasNoTrans, kernel_dim_ / group_, conv_out_spatial_dim_, conv_out_channels_ + / group_, (Dtype) 1., weights + weight_offset_ * g, output + + output_offset_ * g, (Dtype) 0., col_buff + col_offset_ * g); } if (!is_1x1_) { conv_col2im_cpu(col_buff, input); @@ -208,18 +264,19 @@ void BaseConvolutionLayer::weight_cpu_gemm(const Dtype* input, col_buff = col_buffer_.cpu_data(); } for (int g = 0; g < group_; ++g) { - caffe_cpu_gemm(CblasNoTrans, CblasTrans, conv_out_channels_ / group_, - kernel_dim_ / group_, conv_out_spatial_dim_, - (Dtype)1., output + output_offset_ * g, col_buff + col_offset_ * g, - (Dtype)1., weights + weight_offset_ * g); + caffe_cpu_gemm < Dtype + > (CblasNoTrans, CblasTrans, conv_out_channels_ / group_, kernel_dim_ + / group_, conv_out_spatial_dim_, (Dtype) 1., output + + output_offset_ * g, col_buff + col_offset_ * g, (Dtype) 1., weights + + weight_offset_ * g); } } template void BaseConvolutionLayer::backward_cpu_bias(Dtype* bias, const Dtype* input) { - caffe_cpu_gemv(CblasNoTrans, num_output_, height_out_ * width_out_, 1., - input, bias_multiplier_.cpu_data(), 1., bias); + caffe_cpu_gemv < Dtype + > (CblasNoTrans, num_output_, height_out_ * width_out_, 1., input, bias_multiplier_.cpu_data(), 1., bias); } #ifndef CPU_ONLY @@ -233,21 +290,21 @@ void BaseConvolutionLayer::forward_gpu_gemm(const Dtype* input, conv_im2col_gpu(input, col_buffer_.mutable_gpu_data()); } col_buff = col_buffer_.gpu_data(); - } + } for (int g = 0; g < group_; ++g) { - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, conv_out_channels_ / - group_, conv_out_spatial_dim_, kernel_dim_ / group_, - (Dtype)1., weights + weight_offset_ * g, col_buff + col_offset_ * g, - (Dtype)0., output + output_offset_ * g); + caffe_gpu_gemm < Dtype > (&(amdDevice.CommandQueue), CblasNoTrans, CblasNoTrans, conv_out_channels_ + / group_, conv_out_spatial_dim_, kernel_dim_ / group_, (Dtype) 1., weights, weight_offset_ + * g, col_buff, is_1x1_ * bottom_offset_ + col_offset_ * g, (Dtype) 0., output, top_offset_ + + output_offset_ * g); } + } template void BaseConvolutionLayer::forward_gpu_bias(Dtype* output, const Dtype* bias) { - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num_output_, - height_out_ * width_out_, 1, (Dtype)1., bias, bias_multiplier_.gpu_data(), - (Dtype)1., output); + caffe_gpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, num_output_, height_out_ * width_out_, 1, (Dtype) 1., bias, 0, reinterpret_cast(bias_multiplier_.gpu_data()), 0, (Dtype) 1., output, top_offset_); } template @@ -257,12 +314,14 @@ void BaseConvolutionLayer::backward_gpu_gemm(const Dtype* output, if (is_1x1_) { col_buff = input; } + for (int g = 0; g < group_; ++g) { - caffe_gpu_gemm(CblasTrans, CblasNoTrans, kernel_dim_ / group_, - conv_out_spatial_dim_, conv_out_channels_ / group_, - (Dtype)1., weights + weight_offset_ * g, output + output_offset_ * g, - (Dtype)0., col_buff + col_offset_ * g); + caffe_gpu_gemm < Dtype> (&(amdDevice.CommandQueue), CblasTrans, CblasNoTrans, kernel_dim_ + / group_, conv_out_spatial_dim_, conv_out_channels_ / group_, (Dtype) 1., weights, weight_offset_ + * g, output, top_offset_ + output_offset_ * g, (Dtype) 0., col_buff, is_1x1_ * bottom_offset_ + col_offset_ + * g); } + if (!is_1x1_) { conv_col2im_gpu(col_buff, input); } @@ -277,22 +336,118 @@ void BaseConvolutionLayer::weight_gpu_gemm(const Dtype* input, col_buff = col_buffer_.gpu_data(); } for (int g = 0; g < group_; ++g) { - caffe_gpu_gemm(CblasNoTrans, 
CblasTrans, conv_out_channels_ / group_, - kernel_dim_ / group_, conv_out_spatial_dim_, - (Dtype)1., output + output_offset_ * g, col_buff + col_offset_ * g, - (Dtype)1., weights + weight_offset_ * g); + caffe_gpu_gemm < Dtype + > (&(amdDevice.CommandQueue), CblasNoTrans, CblasTrans, conv_out_channels_ + / group_, kernel_dim_ / group_, conv_out_spatial_dim_, (Dtype) 1., output, top_offset_ + output_offset_*g, (Dtype*) col_buff, is_1x1_*bottom_offset_ + col_offset_ * g, (Dtype) 1., (Dtype*) weights, weight_offset_ * g); } } template void BaseConvolutionLayer::backward_gpu_bias(Dtype* bias, const Dtype* input) { - caffe_gpu_gemv(CblasNoTrans, num_output_, height_out_ * width_out_, 1., - input, bias_multiplier_.gpu_data(), 1., bias); + caffe_gpu_gemv < Dtype + > (CblasNoTrans, num_output_, N_, (Dtype) 1., input, top_offset_, N_, reinterpret_cast(bias_multiplier_.gpu_data()), (size_t) 0, (Dtype) 1., 1, bias, (size_t) 0, 1); +} + +// begin: code modified for OpenCL port +template +void BaseConvolutionLayer::forward_gpu_gemm_opt(const Dtype* input, + const Dtype* weight, Dtype* output, bool skip_im2col) { + cl_command_queue Queue; + if (!skip_im2col) { + conv_im2col_gpu_opt(input); + } +#ifdef multiQ + for (int g = 0; g < group_; ++g) { + if(g == 0) Queue = amdDevice.CommandQueue; + else Queue = amdDevice.CommandQueue_helper; + caffe_gpu_gemm(&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_, + (Dtype)1., weight, weight_offset_ * g, (Dtype*)transMem, col_offset_ * g, + (Dtype)0., (Dtype*)subTopMem, top_offset_opt * g); + } + if(group_ == 2) { + clFinish(amdDevice.CommandQueue); + clFinish(amdDevice.CommandQueue_helper); + } +#else + Queue = amdDevice.CommandQueue; + for (int g = 0; g < group_; ++g) { + caffe_gpu_gemm < Dtype + > (&(Queue), CblasNoTrans, CblasNoTrans, M_, N_ * opt_num2, K_, (Dtype) 1., weight, weight_offset_ + * g, (Dtype*) transMem, col_offset_ * g, (Dtype) 0., (Dtype*) subTopMem, top_offset_opt + * g); + } +#endif + transform_gpu((Dtype*) subTopMem, output, top_offset_, N_, M_ * group_, + opt_num2); +} + +template +void BaseConvolutionLayer::forward_gpu_bias_opt(Dtype* output, + const Dtype* bias) { + for (int z = 0; z < opt_num2; z++) + caffe_gpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, num_output_, N_, 1, (Dtype) 1., bias, 0, reinterpret_cast(bias_multiplier_.gpu_data()), 0, (Dtype) 1., output, top_offset_ + + num_output_ * N_ * z); +} + +template +void BaseConvolutionLayer::backward_gpu_gemm_opt(const Dtype* output, + const Dtype* weights, Dtype* input) { + cl_command_queue Queue; + for (int g = 0; g < group_; ++g) { +#ifdef multiQ + if(g == 0) Queue = amdDevice.CommandQueue; + else Queue = amdDevice.CommandQueue_helper; +#else + Queue = amdDevice.CommandQueue; +#endif + caffe_gpu_gemm < Dtype + > (&(Queue), CblasTrans, CblasNoTrans, K_, N_ * opt_num2, M_, (Dtype) 1., weights, weight_offset_ + * g, (Dtype*) subTopMem, top_offset_opt * g, (Dtype) 0., (Dtype*) transMem, col_offset_ + * g); + } +#ifdef multiQ + if(group_ ==2) { + clFinish(amdDevice.CommandQueue); + clFinish(amdDevice.CommandQueue_helper); + } +#endif + + conv_col2im_gpu_opt(input); +} + +template +void BaseConvolutionLayer::weight_gpu_gemm_opt(const Dtype* input, + const Dtype* output, Dtype* weights) { + cl_command_queue Queue; + conv_im2col_gpu_opt(input); + opttrans(output, top_offset_, 1, M_ * group_, N_, (Dtype*) subTopMem, 0, + opt_num2); + + for (int g = 0; g < group_; ++g) { +#ifdef multiQ + if(g == 0) Queue = amdDevice.CommandQueue; + else Queue = amdDevice.CommandQueue_helper; +#else + 
Queue = amdDevice.CommandQueue; +#endif + caffe_gpu_gemm < Dtype + > (&(Queue), CblasNoTrans, CblasTrans, M_, K_, N_ * opt_num2, (Dtype) 1., (Dtype*) subTopMem, top_offset_opt + * g, (Dtype*) transMem, col_offset_ * g, (Dtype) 1., (Dtype*) weights, weight_offset_ + * g); +#ifdef multiQ + if(group_ == 2) { + clFinish(amdDevice.CommandQueue); + clFinish(amdDevice.CommandQueue_helper); + } +#endif + } } +// end: code is modified for OpenCL #endif // !CPU_ONLY -INSTANTIATE_CLASS(BaseConvolutionLayer); +INSTANTIATE_CLASS (BaseConvolutionLayer); } // namespace caffe diff --git a/src/caffe/layers/base_data_layer.cpp b/src/caffe/layers/base_data_layer.cpp index 26a11182..ff4436a7 100644 --- a/src/caffe/layers/base_data_layer.cpp +++ b/src/caffe/layers/base_data_layer.cpp @@ -3,18 +3,18 @@ #include "caffe/data_layers.hpp" #include "caffe/util/io.hpp" +#include "caffe/util/benchmark.hpp" namespace caffe { template BaseDataLayer::BaseDataLayer(const LayerParameter& param) - : Layer(param), - transform_param_(param.transform_param()) { + : Layer(param), transform_param_(param.transform_param()) { } template void BaseDataLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { if (top.size() == 1) { output_labels_ = false; } else { @@ -30,7 +30,7 @@ void BaseDataLayer::LayerSetUp(const vector*>& bottom, template void BasePrefetchingDataLayer::LayerSetUp( const vector*>& bottom, const vector*>& top) { - BaseDataLayer::LayerSetUp(bottom, top); + BaseDataLayer < Dtype > ::LayerSetUp(bottom, top); // Now, start the prefetch thread. Before calling prefetch, we make two // cpu_data calls so that the prefetch thread does not accidentally make // simultaneous cudaMalloc calls when the main thread is running. In some @@ -60,30 +60,62 @@ void BasePrefetchingDataLayer::Forward_cpu( const vector*>& bottom, const vector*>& top) { // First, join the thread JoinPrefetchThread(); + DLOG(INFO) << "Thread joined"; // Reshape to loaded data. top[0]->ReshapeLike(prefetch_data_); // Copy the data caffe_copy(prefetch_data_.count(), prefetch_data_.cpu_data(), - top[0]->mutable_cpu_data()); + top[0]->mutable_cpu_data()); DLOG(INFO) << "Prefetch copied"; if (this->output_labels_) { // Reshape to loaded labels. top[1]->ReshapeLike(prefetch_label_); // Copy the labels. caffe_copy(prefetch_label_.count(), prefetch_label_.cpu_data(), - top[1]->mutable_cpu_data()); + top[1]->mutable_cpu_data()); } // Start a new prefetch thread DLOG(INFO) << "CreatePrefetchThread"; CreatePrefetchThread(); } -#ifdef CPU_ONLY +#ifndef CPU_ONLY + +template +void BasePrefetchingDataLayer::Forward_gpu( + const vector*>& bottom, const vector*>& top) { + + JoinPrefetchThread(); + DLOG(INFO) << "Thread joined"; + + top[0]->ReshapeLike(this->prefetch_data_); + OCL_CHECK( + clEnqueueWriteBuffer(amdDevice.CommandQueue, + (cl_mem) top[0]->mutable_gpu_data(), CL_TRUE, 0, + sizeof(Dtype) * prefetch_data_.count(), prefetch_data_.cpu_data(), 0, + NULL, NULL)); + DLOG(INFO) << "Prefetch copied"; + if (this->output_labels_) { + // Reshape to loaded labels. 
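// Illustrative sketch of the copy pattern used in this Forward_gpu (editor's
// example, not part of the patch): OpenCL device memory is an opaque cl_mem
// handle rather than a raw pointer, so the prefetched host buffer is pushed
// with a blocking clEnqueueWriteBuffer instead of caffe_copy through
// mutable_gpu_data(). `queue` stands in for amdDevice.CommandQueue, and error
// handling is reduced to a return-code check rather than the port's OCL_CHECK.
#include <CL/cl.h>
#include <cstddef>

template <typename Dtype>
bool blocking_host_to_device(cl_command_queue queue, cl_mem dst,
                             const Dtype* src, size_t count) {
  // CL_TRUE makes the call synchronous: it returns only after `src` has been
  // read, so the prefetch thread may safely refill the host buffer afterwards.
  cl_int err = clEnqueueWriteBuffer(queue, dst, CL_TRUE, /*offset=*/0,
                                    count * sizeof(Dtype), src,
                                    0, NULL, NULL);
  return err == CL_SUCCESS;
}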
+ top[1]->ReshapeLike(prefetch_label_); + OCL_CHECK( + clEnqueueWriteBuffer(amdDevice.CommandQueue, + (cl_mem) top[1]->mutable_gpu_data(), CL_TRUE, 0, + sizeof(Dtype) * prefetch_label_.count(), prefetch_label_.cpu_data(), + 0, NULL, NULL)); + } + + // Start a new prefetch thread + DLOG(INFO) << "CreatePrefetchThread"; + CreatePrefetchThread(); +} + +#else STUB_GPU_FORWARD(BasePrefetchingDataLayer, Forward); #endif -INSTANTIATE_CLASS(BaseDataLayer); -INSTANTIATE_CLASS(BasePrefetchingDataLayer); +INSTANTIATE_CLASS (BaseDataLayer); +INSTANTIATE_CLASS (BasePrefetchingDataLayer); } // namespace caffe diff --git a/src/caffe/layers/base_data_layer.cu b/src/caffe/layers/base_data_layer.cu deleted file mode 100644 index 9335a5bc..00000000 --- a/src/caffe/layers/base_data_layer.cu +++ /dev/null @@ -1,30 +0,0 @@ -#include - -#include "caffe/data_layers.hpp" - -namespace caffe { - -template -void BasePrefetchingDataLayer::Forward_gpu( - const vector*>& bottom, const vector*>& top) { - // First, join the thread - JoinPrefetchThread(); - // Reshape to loaded data. - top[0]->ReshapeLike(this->prefetch_data_); - // Copy the data - caffe_copy(prefetch_data_.count(), prefetch_data_.cpu_data(), - top[0]->mutable_gpu_data()); - if (this->output_labels_) { - // Reshape to loaded labels. - top[1]->ReshapeLike(prefetch_label_); - // Copy the labels. - caffe_copy(prefetch_label_.count(), prefetch_label_.cpu_data(), - top[1]->mutable_gpu_data()); - } - // Start a new prefetch thread - CreatePrefetchThread(); -} - -INSTANTIATE_LAYER_GPU_FORWARD(BasePrefetchingDataLayer); - -} // namespace caffe diff --git a/src/caffe/layers/bnll_layer.cpp b/src/caffe/layers/bnll_layer.cpp index 9ba0ea9a..68a19265 100644 --- a/src/caffe/layers/bnll_layer.cpp +++ b/src/caffe/layers/bnll_layer.cpp @@ -15,16 +15,16 @@ void BNLLLayer::Forward_cpu(const vector*>& bottom, Dtype* top_data = top[0]->mutable_cpu_data(); const int count = bottom[0]->count(); for (int i = 0; i < count; ++i) { - top_data[i] = bottom_data[i] > 0 ? - bottom_data[i] + log(1. + exp(-bottom_data[i])) : - log(1. + exp(bottom_data[i])); + top_data[i] = + bottom_data[i] > 0 ? + bottom_data[i] + log(1. + exp(-bottom_data[i])) : + log(1. 
+ exp(bottom_data[i])); } } template void BNLLLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { if (propagate_down[0]) { const Dtype* bottom_data = bottom[0]->cpu_data(); const Dtype* top_diff = top[0]->cpu_diff(); @@ -38,11 +38,37 @@ void BNLLLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY +#ifndef CPU_ONLY +// begin: code modified for OpenCL port +template +void BNLLLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + const int count = bottom[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + BNLLForward(count, bottom_data, top_data); +} + +template +void BNLLLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[0]) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const int count = bottom[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + BNLLBackward(count, top_diff, bottom_data, bottom_diff); + } +} +// end: code modified for OpenCL port + +#else STUB_GPU(BNLLLayer); #endif -INSTANTIATE_CLASS(BNLLLayer); -REGISTER_LAYER_CLASS(BNLL); +INSTANTIATE_CLASS (BNLLLayer); +REGISTER_LAYER_CLASS (BNLL); } // namespace caffe diff --git a/src/caffe/layers/bnll_layer.cu b/src/caffe/layers/bnll_layer.cu deleted file mode 100644 index d963d068..00000000 --- a/src/caffe/layers/bnll_layer.cu +++ /dev/null @@ -1,60 +0,0 @@ -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -const float kBNLL_THRESHOLD = 50.; - -template -__global__ void BNLLForward(const int n, const Dtype* in, Dtype* out) { - CUDA_KERNEL_LOOP(index, n) { - out[index] = in[index] > 0 ? - in[index] + log(1. + exp(-in[index])) : - log(1. 
+ exp(in[index])); - } -} - -template -void BNLLLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - BNLLForward<<>>( - count, bottom_data, top_data); - CUDA_POST_KERNEL_CHECK; -} - -template -__global__ void BNLLBackward(const int n, const Dtype* in_diff, - const Dtype* in_data, Dtype* out_diff) { - CUDA_KERNEL_LOOP(index, n) { - Dtype expval = exp(min(in_data[index], Dtype(kBNLL_THRESHOLD))); - out_diff[index] = in_diff[index] * expval / (expval + 1.); - } -} - -template -void BNLLLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - BNLLBackward<<>>( - count, top_diff, bottom_data, bottom_diff); - CUDA_POST_KERNEL_CHECK; - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(BNLLLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/concat_layer.cpp b/src/caffe/layers/concat_layer.cpp index 1cac8fc3..5def30d4 100644 --- a/src/caffe/layers/concat_layer.cpp +++ b/src/caffe/layers/concat_layer.cpp @@ -8,7 +8,7 @@ namespace caffe { template void ConcatLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const ConcatParameter& concat_param = this->layer_param_.concat_param(); CHECK(!(concat_param.has_axis() && concat_param.has_concat_dim())) << "Either axis or concat_dim should be specified; not both."; @@ -16,7 +16,7 @@ void ConcatLayer::LayerSetUp(const vector*>& bottom, template void ConcatLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const int num_axes = bottom[0]->num_axes(); const ConcatParameter& concat_param = this->layer_param_.concat_param(); if (concat_param.has_concat_dim()) { @@ -39,7 +39,9 @@ void ConcatLayer::Reshape(const vector*>& bottom, CHECK_EQ(num_axes, bottom[i]->num_axes()) << "All inputs must have the same #axes."; for (int j = 0; j < num_axes; ++j) { - if (j == concat_axis_) { continue; } + if (j == concat_axis_) { + continue; + } CHECK_EQ(top_shape[j], bottom[i]->shape(j)) << "All inputs must have the same shape, except at concat_axis."; } @@ -52,7 +54,7 @@ void ConcatLayer::Reshape(const vector*>& bottom, template void ConcatLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { Dtype* top_data = top[0]->mutable_cpu_data(); int offset_concat_axis = 0; const int top_concat_axis = top[0]->shape(concat_axis_); @@ -62,8 +64,9 @@ void ConcatLayer::Forward_cpu(const vector*>& bottom, for (int n = 0; n < num_concats_; ++n) { caffe_copy(bottom_concat_axis * concat_input_size_, bottom_data + n * bottom_concat_axis * concat_input_size_, - top_data + (n * top_concat_axis + offset_concat_axis) - * concat_input_size_); + top_data + + (n * top_concat_axis + offset_concat_axis) + * concat_input_size_); } offset_concat_axis += bottom_concat_axis; } @@ -71,28 +74,78 @@ void ConcatLayer::Forward_cpu(const vector*>& bottom, template void ConcatLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { const Dtype* top_diff = top[0]->cpu_diff(); int offset_concat_axis = 0; 
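// Illustrative sketch (editor's example, not part of the patch): the index
// arithmetic behind the Concat(...) helper invoked by the OpenCL Forward_gpu /
// Backward_gpu further below. It mirrors the CUDA kernel deleted from
// concat_layer.cu: each flat index into a bottom blob maps to its slot in the
// concatenated top blob, and running the map with forward == false scatters
// the top gradient back into that bottom.
template <typename Dtype>
void concat_reference(const int nthreads, const Dtype* in_data,
                      const bool forward, const int num_concats,
                      const int concat_size, const int top_concat_axis,
                      const int bottom_concat_axis,
                      const int offset_concat_axis, Dtype* out_data) {
  for (int index = 0; index < nthreads; ++index) {
    const int total_concat_size = concat_size * bottom_concat_axis;
    const int concat_num = index / total_concat_size;    // slice along num_concats_
    const int concat_index = index % total_concat_size;  // position inside the slice
    const int top_index = concat_index +
        (concat_num * top_concat_axis + offset_concat_axis) * concat_size;
    if (forward) {
      out_data[top_index] = in_data[index];   // gather bottom -> top
    } else {
      out_data[index] = in_data[top_index];   // scatter top diff -> bottom
    }
  }
}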
const int top_concat_axis = top[0]->shape(concat_axis_); for (int i = 0; i < bottom.size(); ++i) { - if (!propagate_down[i]) { continue; } + if (!propagate_down[i]) { + continue; + } Dtype* bottom_diff = bottom[i]->mutable_cpu_diff(); const int bottom_concat_axis = bottom[i]->shape(concat_axis_); for (int n = 0; n < num_concats_; ++n) { - caffe_copy(bottom_concat_axis * concat_input_size_, top_diff + - (n * top_concat_axis + offset_concat_axis) * concat_input_size_, + caffe_copy(bottom_concat_axis * concat_input_size_, + top_diff + + (n * top_concat_axis + offset_concat_axis) * concat_input_size_, bottom_diff + n * bottom_concat_axis * concat_input_size_); } offset_concat_axis += bottom_concat_axis; } } -#ifdef CPU_ONLY +#ifndef CPU_ONLY +// begin: code modified for OpenCL port +template +void ConcatLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + if (bottom.size() == 1) { + return; + } + Dtype* top_data = top[0]->mutable_gpu_data(); + int offset_concat_axis = 0; + const int top_concat_axis = top[0]->shape(concat_axis_); + const bool kForward = true; + for (int i = 0; i < bottom.size(); ++i) { + const Dtype* bottom_data = bottom[i]->gpu_data(); + const int bottom_concat_axis = bottom[i]->shape(concat_axis_); + const int bottom_concat_size = bottom_concat_axis * concat_input_size_; + const int nthreads = bottom_concat_size * num_concats_; + Concat(nthreads, bottom_data, kForward, num_concats_, concat_input_size_, + top_concat_axis, bottom_concat_axis, offset_concat_axis, top_data); + offset_concat_axis += bottom_concat_axis; + } +} + +template +void ConcatLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + if (bottom.size() == 1) { + return; + } + const Dtype* top_diff = top[0]->gpu_diff(); + int offset_concat_axis = 0; + const int top_concat_axis = top[0]->shape(concat_axis_); + const bool kForward = false; + for (int i = 0; i < bottom.size(); ++i) { + const int bottom_concat_axis = bottom[i]->shape(concat_axis_); + if (propagate_down[i]) { + Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); + const int bottom_concat_size = bottom_concat_axis * concat_input_size_; + const int nthreads = bottom_concat_size * num_concats_; + Concat(nthreads, top_diff, kForward, num_concats_, concat_input_size_, + top_concat_axis, bottom_concat_axis, offset_concat_axis, bottom_diff); + } + offset_concat_axis += bottom_concat_axis; + } +} +// end: code modified for OpenCL port + +#else STUB_GPU(ConcatLayer); #endif -INSTANTIATE_CLASS(ConcatLayer); -REGISTER_LAYER_CLASS(Concat); +INSTANTIATE_CLASS (ConcatLayer); +REGISTER_LAYER_CLASS (Concat); } // namespace caffe diff --git a/src/caffe/layers/concat_layer.cu b/src/caffe/layers/concat_layer.cu deleted file mode 100644 index 8f2e85d8..00000000 --- a/src/caffe/layers/concat_layer.cu +++ /dev/null @@ -1,71 +0,0 @@ -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -__global__ void Concat(const int nthreads, const Dtype* in_data, - const bool forward, const int num_concats, const int concat_size, - const int top_concat_axis, const int bottom_concat_axis, - const int offset_concat_axis, Dtype* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int total_concat_size = concat_size * bottom_concat_axis; - const int concat_num = index / total_concat_size; - const int concat_index = index % total_concat_size; - const int top_index = concat_index + - (concat_num * top_concat_axis + 
offset_concat_axis) * concat_size; - if (forward) { - out_data[top_index] = in_data[index]; - } else { - out_data[index] = in_data[top_index]; - } - } -} - -template -void ConcatLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - Dtype* top_data = top[0]->mutable_gpu_data(); - int offset_concat_axis = 0; - const int top_concat_axis = top[0]->shape(concat_axis_); - const bool kForward = true; - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - const int bottom_concat_axis = bottom[i]->shape(concat_axis_); - const int bottom_concat_size = bottom_concat_axis * concat_input_size_; - const int nthreads = bottom_concat_size * num_concats_; - Concat // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - nthreads, bottom_data, kForward, num_concats_, concat_input_size_, - top_concat_axis, bottom_concat_axis, offset_concat_axis, top_data); - offset_concat_axis += bottom_concat_axis; - } -} - -template -void ConcatLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* top_diff = top[0]->gpu_diff(); - int offset_concat_axis = 0; - const int top_concat_axis = top[0]->shape(concat_axis_); - const bool kForward = false; - for (int i = 0; i < bottom.size(); ++i) { - if (!propagate_down[i]) { continue; } - Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); - const int bottom_concat_axis = bottom[i]->shape(concat_axis_); - const int bottom_concat_size = bottom_concat_axis * concat_input_size_; - const int nthreads = bottom_concat_size * num_concats_; - Concat // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - nthreads, top_diff, kForward, num_concats_, concat_input_size_, - top_concat_axis, bottom_concat_axis, offset_concat_axis, bottom_diff); - offset_concat_axis += bottom_concat_axis; - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(ConcatLayer); - -} // namespace caffe diff --git a/src/caffe/layers/contrastive_loss_layer.cpp b/src/caffe/layers/contrastive_loss_layer.cpp index 25e16781..3410b927 100644 --- a/src/caffe/layers/contrastive_loss_layer.cpp +++ b/src/caffe/layers/contrastive_loss_layer.cpp @@ -9,9 +9,9 @@ namespace caffe { template -void ContrastiveLossLayer::LayerSetUp( - const vector*>& bottom, const vector*>& top) { - LossLayer::LayerSetUp(bottom, top); +void ContrastiveLossLayer::LayerSetUp(const vector*>& bottom, + const vector*>& top) { + LossLayer < Dtype > ::LayerSetUp(bottom, top); CHECK_EQ(bottom[0]->channels(), bottom[1]->channels()); CHECK_EQ(bottom[0]->height(), 1); CHECK_EQ(bottom[0]->width(), 1); @@ -31,12 +31,9 @@ void ContrastiveLossLayer::LayerSetUp( template void ContrastiveLossLayer::Forward_cpu( - const vector*>& bottom, - const vector*>& top) { + const vector*>& bottom, const vector*>& top) { int count = bottom[0]->count(); - caffe_sub( - count, - bottom[0]->cpu_data(), // a + caffe_sub(count, bottom[0]->cpu_data(), // a bottom[1]->cpu_data(), // b diff_.mutable_cpu_data()); // a_i-b_i const int channels = bottom[0]->channels(); @@ -46,7 +43,7 @@ void ContrastiveLossLayer::Forward_cpu( Dtype loss(0.0); for (int i = 0; i < bottom[0]->num(); ++i) { dist_sq_.mutable_cpu_data()[i] = caffe_cpu_dot(channels, - diff_.cpu_data() + (i*channels), diff_.cpu_data() + (i*channels)); + diff_.cpu_data() + (i * channels), diff_.cpu_data() + (i * channels)); if (static_cast(bottom[2]->cpu_data()[i])) { // similar pairs loss += dist_sq_.cpu_data()[i]; } else { // dissimilar pairs @@ -54,7 +51,7 @@ void ContrastiveLossLayer::Forward_cpu( loss += std::max(margin - 
dist_sq_.cpu_data()[i], Dtype(0.0)); } else { Dtype dist = std::max(margin - sqrt(dist_sq_.cpu_data()[i]), 0.0); - loss += dist*dist; + loss += dist * dist; } } } @@ -71,19 +68,15 @@ void ContrastiveLossLayer::Backward_cpu(const vector*>& top, for (int i = 0; i < 2; ++i) { if (propagate_down[i]) { const Dtype sign = (i == 0) ? 1 : -1; - const Dtype alpha = sign * top[0]->cpu_diff()[0] / - static_cast(bottom[i]->num()); + const Dtype alpha = sign * top[0]->cpu_diff()[0] + / static_cast(bottom[i]->num()); int num = bottom[i]->num(); int channels = bottom[i]->channels(); for (int j = 0; j < num; ++j) { Dtype* bout = bottom[i]->mutable_cpu_diff(); if (static_cast(bottom[2]->cpu_data()[j])) { // similar pairs - caffe_cpu_axpby( - channels, - alpha, - diff_.cpu_data() + (j*channels), - Dtype(0.0), - bout + (j*channels)); + caffe_cpu_axpby(channels, alpha, diff_.cpu_data() + (j * channels), + Dtype(0.0), bout + (j * channels)); } else { // dissimilar pairs Dtype mdist(0.0); Dtype beta(0.0); @@ -96,14 +89,10 @@ void ContrastiveLossLayer::Backward_cpu(const vector*>& top, beta = -alpha * mdist / (dist + Dtype(1e-4)); } if (mdist > Dtype(0.0)) { - caffe_cpu_axpby( - channels, - beta, - diff_.cpu_data() + (j*channels), - Dtype(0.0), - bout + (j*channels)); + caffe_cpu_axpby(channels, beta, diff_.cpu_data() + (j * channels), + Dtype(0.0), bout + (j * channels)); } else { - caffe_set(channels, Dtype(0), bout + (j*channels)); + caffe_set(channels, Dtype(0), bout + (j * channels)); } } } @@ -111,11 +100,69 @@ void ContrastiveLossLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY +// begin: code modified for OpenCL port +#ifndef CPU_ONLY +template +void ContrastiveLossLayer::Forward_gpu( + const vector*>& bottom, const vector*>& top) { + const int count = bottom[0]->count(); + caffe_gpu_sub(count, bottom[0]->gpu_data(), // a + bottom[1]->gpu_data(), // b + diff_.mutable_gpu_data()); // a_i-b_i + caffe_gpu_powx(count, diff_.mutable_gpu_data(), // a_i-b_i + Dtype(2), diff_sq_.mutable_gpu_data()); // (a_i-b_i)^2 + caffe_gpu_gemv(CblasNoTrans, bottom[0]->num(), bottom[0]->channels(), + Dtype(1.0), diff_sq_.gpu_data(), // (a_i-b_i)^2 + summer_vec_.gpu_data(), Dtype(0.0), dist_sq_.mutable_gpu_data()); // \Sum (a_i-b_i)^2 + Dtype margin = this->layer_param_.contrastive_loss_param().margin(); + bool legacy_version = + this->layer_param_.contrastive_loss_param().legacy_version(); + Dtype loss(0.0); + for (int i = 0; i < bottom[0]->num(); ++i) { + if (static_cast(bottom[2]->cpu_data()[i])) { // similar pairs + loss += dist_sq_.cpu_data()[i]; + } else { // dissimilar pairs + if (legacy_version) { + loss += std::max(margin - dist_sq_.cpu_data()[i], Dtype(0.0)); + } else { + Dtype dist = std::max(margin - sqrt(dist_sq_.cpu_data()[i]), 0.0); + loss += dist * dist; + } + } + } + loss = loss / static_cast(bottom[0]->num()) / Dtype(2); + top[0]->mutable_cpu_data()[0] = loss; +} + +template +void ContrastiveLossLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + for (int i = 0; i < 2; ++i) { + if (propagate_down[i]) { + const int count = bottom[0]->count(); + const int channels = bottom[0]->channels(); + Dtype margin = this->layer_param_.contrastive_loss_param().margin(); + const bool legacy_version = + this->layer_param_.contrastive_loss_param().legacy_version(); + const Dtype sign = (i == 0) ? 
1 : -1; + const Dtype alpha = sign * top[0]->cpu_diff()[0] + / static_cast(bottom[0]->num()); + // NOLINT_NEXT_LINE(whitespace/operators) + CLLBackward(count, channels, margin, legacy_version, alpha, + bottom[2]->gpu_data(), // pair similarity 0 or 1 + diff_.gpu_data(), // the cached eltwise difference between a and b + dist_sq_.gpu_data(), // the cached square distance between a and b + bottom[i]->mutable_gpu_diff()); + } + } +} +// end: code modified for OpenCL port + +#else STUB_GPU(ContrastiveLossLayer); #endif -INSTANTIATE_CLASS(ContrastiveLossLayer); -REGISTER_LAYER_CLASS(ContrastiveLoss); +INSTANTIATE_CLASS (ContrastiveLossLayer); +REGISTER_LAYER_CLASS (ContrastiveLoss); } // namespace caffe diff --git a/src/caffe/layers/contrastive_loss_layer.cu b/src/caffe/layers/contrastive_loss_layer.cu deleted file mode 100644 index 93123931..00000000 --- a/src/caffe/layers/contrastive_loss_layer.cu +++ /dev/null @@ -1,111 +0,0 @@ -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/util/io.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void ContrastiveLossLayer::Forward_gpu( - const vector*>& bottom, const vector*>& top) { - const int count = bottom[0]->count(); - caffe_gpu_sub( - count, - bottom[0]->gpu_data(), // a - bottom[1]->gpu_data(), // b - diff_.mutable_gpu_data()); // a_i-b_i - caffe_gpu_powx( - count, - diff_.mutable_gpu_data(), // a_i-b_i - Dtype(2), - diff_sq_.mutable_gpu_data()); // (a_i-b_i)^2 - caffe_gpu_gemv( - CblasNoTrans, - bottom[0]->num(), - bottom[0]->channels(), - Dtype(1.0), - diff_sq_.gpu_data(), // (a_i-b_i)^2 - summer_vec_.gpu_data(), - Dtype(0.0), - dist_sq_.mutable_gpu_data()); // \Sum (a_i-b_i)^2 - Dtype margin = this->layer_param_.contrastive_loss_param().margin(); - bool legacy_version = - this->layer_param_.contrastive_loss_param().legacy_version(); - Dtype loss(0.0); - for (int i = 0; i < bottom[0]->num(); ++i) { - if (static_cast(bottom[2]->cpu_data()[i])) { // similar pairs - loss += dist_sq_.cpu_data()[i]; - } else { // dissimilar pairs - if (legacy_version) { - loss += std::max(margin - dist_sq_.cpu_data()[i], Dtype(0.0)); - } else { - Dtype dist = std::max(margin - sqrt(dist_sq_.cpu_data()[i]), - Dtype(0.0)); - loss += dist*dist; - } - } - } - loss = loss / static_cast(bottom[0]->num()) / Dtype(2); - top[0]->mutable_cpu_data()[0] = loss; -} - -template -__global__ void CLLBackward(const int count, const int channels, - const Dtype margin, const bool legacy_version, const Dtype alpha, - const Dtype* y, const Dtype* diff, const Dtype* dist_sq, - Dtype *bottom_diff) { - CUDA_KERNEL_LOOP(i, count) { - int n = i / channels; // the num index, to access y and dist_sq - if (static_cast(y[n])) { // similar pairs - bottom_diff[i] = alpha * diff[i]; - } else { // dissimilar pairs - Dtype mdist(0.0); - Dtype beta(0.0); - if (legacy_version) { - mdist = (margin - dist_sq[n]); - beta = -alpha; - } else { - Dtype dist = sqrt(dist_sq[n]); - mdist = (margin - dist); - beta = -alpha * mdist / (dist + Dtype(1e-4)) * diff[i]; - } - if (mdist > 0.0) { - bottom_diff[i] = beta; - } else { - bottom_diff[i] = 0; - } - } - } -} - -template -void ContrastiveLossLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - for (int i = 0; i < 2; ++i) { - if (propagate_down[i]) { - const int count = bottom[0]->count(); - const int channels = bottom[0]->channels(); - Dtype margin = this->layer_param_.contrastive_loss_param().margin(); - const bool legacy_version 
= - this->layer_param_.contrastive_loss_param().legacy_version(); - const Dtype sign = (i == 0) ? 1 : -1; - const Dtype alpha = sign * top[0]->cpu_diff()[0] / - static_cast(bottom[0]->num()); - // NOLINT_NEXT_LINE(whitespace/operators) - CLLBackward<<>>( - count, channels, margin, legacy_version, alpha, - bottom[2]->gpu_data(), // pair similarity 0 or 1 - diff_.gpu_data(), // the cached eltwise difference between a and b - dist_sq_.gpu_data(), // the cached square distance between a and b - bottom[i]->mutable_gpu_diff()); - CUDA_POST_KERNEL_CHECK; - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(ContrastiveLossLayer); - -} // namespace caffe diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index 928ef5ee..4bfd4dba 100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -1,5 +1,4 @@ #include - #include "caffe/filler.hpp" #include "caffe/layer.hpp" #include "caffe/util/im2col.hpp" @@ -18,7 +17,7 @@ void ConvolutionLayer::compute_output_shape() { template void ConvolutionLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const Dtype* weight = this->blobs_[0]->cpu_data(); for (int i = 0; i < bottom.size(); ++i) { const Dtype* bottom_data = bottom[i]->cpu_data(); @@ -32,11 +31,13 @@ void ConvolutionLayer::Forward_cpu(const vector*>& bottom, } } } + + // CHECK_BLOB_DATA(top[0],20, "top[0]"); } template void ConvolutionLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { const Dtype* weight = this->blobs_[0]->cpu_data(); Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff(); for (int i = 0; i < top.size(); ++i) { @@ -65,12 +66,171 @@ void ConvolutionLayer::Backward_cpu(const vector*>& top, } } } + +} + +// begin: code modified for OpenCL port +#ifndef CPU_ONLY +template +void ConvolutionLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + //if (!this->is_1x1_ && use_packing_scheme && global_packing_N > 1) + //Forward_gpu_batched(bottom, top); + //else + Forward_gpu_org(bottom, top); +} + +template +void ConvolutionLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + //if (!this->is_1x1_ && use_packing_scheme && global_packing_N > 1) + Backward_gpu_batched(top, propagate_down, bottom); + //else + //Backward_gpu_org(top, propagate_down, bottom); +} + +template +void ConvolutionLayer::Forward_gpu_batched( + const vector*>& bottom, const vector*>& top) { + const Dtype* weight = this->blobs_[0]->gpu_data(); + for (int i = 0; i < bottom.size(); ++i) { + const Dtype* bottom_data = bottom[i]->gpu_data(); + //CHECK_BLOB_DATA(bottom[i],10,"bottom"); + + Dtype* top_data = top[i]->mutable_gpu_data(); + this->opt_num2 = global_packing_N; + this->weight_offset_ = this->M_ * this->K_; + for (int n = 0; n < this->num_; n += this->opt_num2) { + this->opt_num2 = + this->opt_num2 > (this->num_ - n) ? 
(this->num_ - n) : this->opt_num2; + //intermediate variables to pass offset + this->top_offset_opt = this->M_ * this->N_ * this->opt_num2; + this->top_offset_ = top[i]->offset(n); + this->col_offset_ = this->K_ * this->N_ * this->opt_num2; + this->bottom_offset_ = bottom[i]->offset(n); + this->forward_gpu_gemm_opt(bottom_data, weight, top_data); + if (this->bias_term_) { + const Dtype* bias = this->blobs_[1]->gpu_data(); + this->forward_gpu_bias_opt(top_data, bias); + } + } + } + + //CHECK_BLOB_DATA(this->blobs_[0],20, "weights"); + //CHECK_BLOB_DATA(top[0],20, "top[0]"); + +} + +template +void ConvolutionLayer::Forward_gpu_org( + const vector*>& bottom, const vector*>& top) { + const Dtype* weight = this->blobs_[0]->gpu_data(); + for (int i = 0; i < bottom.size(); ++i) { + const Dtype* bottom_data = bottom[i]->gpu_data(); + Dtype* top_data = top[i]->mutable_gpu_data(); + for (int n = 0; n < this->num_; ++n) { + //two intermediate variables to pass offset + this->bottom_offset_ = bottom[i]->offset(n); + this->top_offset_ = top[i]->offset(n); + this->col_offset_ = this->K_ * this->N_; + this->forward_gpu_gemm(bottom_data, weight, top_data); + + if (this->bias_term_) { + const Dtype* bias = this->blobs_[1]->gpu_data(); + this->forward_gpu_bias(top_data, bias); + } + } + } + + // CHECK_BLOB_DATA(this->blobs_[0],20, "weights"); + //CHECK_BLOB_DATA(top[0],20, "top[0]"); +} + +template +void ConvolutionLayer::Backward_gpu_batched(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + const Dtype* weight = this->blobs_[0]->gpu_data(); + Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); + for (int i = 0; i < top.size(); ++i) { + const Dtype* top_diff = top[i]->gpu_diff(); + + // Bias gradient, if necessary. + if (this->bias_term_ && this->param_propagate_down_[1]) { + Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); + for (int n = 0; n < this->num_; ++n) { + this->top_offset_ = top[i]->offset(n); + this->backward_gpu_bias(bias_diff, top_diff); + } + } + if (this->param_propagate_down_[0] || propagate_down[i]) { + const Dtype* bottom_data = bottom[i]->gpu_data(); + Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); + this->weight_offset_ = this->M_ * this->K_; + this->opt_num2 = global_packing_N; + for (int n = 0; n < this->num_; n += this->opt_num2) { + this->opt_num2 = + this->opt_num2 > (this->num_ - n) ? + (this->num_ - n) : this->opt_num2; + this->top_offset_ = top[i]->offset(n); + this->bottom_offset_ = bottom[i]->offset(n); + this->col_offset_ = this->K_ * (this->N_ * this->opt_num2); + this->top_offset_opt = this->M_ * (this->N_ * this->opt_num2); + // gradient w.r.t. weight. Note that we will accumulate diffs. + if (this->param_propagate_down_[0]) { + this->weight_gpu_gemm_opt(bottom_data, top_diff, weight_diff); + } + // gradient w.r.t. bottom data, if necessary. + if (propagate_down[i]) { + this->backward_gpu_gemm_opt(top_diff, weight, bottom_diff); + } + } + } + } +} +template +void ConvolutionLayer::Backward_gpu_org(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + const Dtype* weight = this->blobs_[0]->gpu_data(); + Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); + for (int i = 0; i < top.size(); ++i) { + const Dtype* top_diff = top[i]->gpu_diff(); + + // Bias gradient, if necessary. 
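// Illustrative sketch (editor's example, not part of the patch): the chunking
// used by the *_batched paths above. global_packing_N images are packed into a
// single GEMM, and opt_num2 is clamped on the final chunk so a batch size that
// is not a multiple of the packing factor is still covered exactly once. The
// per-chunk slab offsets mirror the member variables set above:
//   col_offset_     = K_ * N_ * opt_num2   (packed im2col slab per group)
//   top_offset_opt  = M_ * N_ * opt_num2   (packed output slab per group)
#include <algorithm>
#include <cstdio>
int main() {
  const int num = 50;                // assumed mini-batch size
  const int global_packing_N = 16;   // assumed packing factor
  for (int n = 0; n < num; n += global_packing_N) {
    const int opt_num2 = std::min(global_packing_N, num - n);
    std::printf("images [%d, %d) -> one packed GEMM over %d images\n",
                n, n + opt_num2, opt_num2);   // chunks of 16, 16, 16, 2
  }
  return 0;
}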
+ if (this->bias_term_ && this->param_propagate_down_[1]) { + Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); + for (int n = 0; n < this->num_; ++n) { + // + this->top_offset_ = top[i]->offset(n); + this->bottom_offset_ = bottom[i]->offset(n); + this->col_offset_ = this->K_ * this->N_; + this->backward_gpu_bias(bias_diff, top_diff); + } + } + if (this->param_propagate_down_[0] || propagate_down[i]) { + const Dtype* bottom_data = bottom[i]->gpu_data(); + Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); + for (int n = 0; n < this->num_; ++n) { + this->top_offset_ = top[i]->offset(n); + this->bottom_offset_ = bottom[i]->offset(n); + // gradient w.r.t. weight. Note that we will accumulate diffs. + if (this->param_propagate_down_[0]) { + this->weight_gpu_gemm(bottom_data, top_diff, weight_diff); + } + // gradient w.r.t. bottom data, if necessary. + if (propagate_down[i]) { + this->backward_gpu_gemm(top_diff, weight, bottom_diff); + } + } + } + } + } +// end: code modified for OpenCL port -#ifdef CPU_ONLY +#else STUB_GPU(ConvolutionLayer); #endif -INSTANTIATE_CLASS(ConvolutionLayer); +INSTANTIATE_CLASS (ConvolutionLayer); } // namespace caffe diff --git a/src/caffe/layers/conv_layer.cu b/src/caffe/layers/conv_layer.cu deleted file mode 100644 index b8a98ff7..00000000 --- a/src/caffe/layers/conv_layer.cu +++ /dev/null @@ -1,64 +0,0 @@ -#include - -#include "caffe/filler.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/im2col.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void ConvolutionLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* weight = this->blobs_[0]->gpu_data(); - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* top_data = top[i]->mutable_gpu_data(); - for (int n = 0; n < this->num_; ++n) { - this->forward_gpu_gemm(bottom_data + bottom[i]->offset(n), weight, - top_data + top[i]->offset(n)); - if (this->bias_term_) { - const Dtype* bias = this->blobs_[1]->gpu_data(); - this->forward_gpu_bias(top_data + top[i]->offset(n), bias); - } - } - } -} - -template -void ConvolutionLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* weight = this->blobs_[0]->gpu_data(); - Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); - for (int i = 0; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->gpu_diff(); - // Bias gradient, if necessary. - if (this->bias_term_ && this->param_propagate_down_[1]) { - Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); - for (int n = 0; n < this->num_; ++n) { - this->backward_gpu_bias(bias_diff, top_diff + top[i]->offset(n)); - } - } - if (this->param_propagate_down_[0] || propagate_down[i]) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); - for (int n = 0; n < this->num_; ++n) { - // gradient w.r.t. weight. Note that we will accumulate diffs. - if (this->param_propagate_down_[0]) { - this->weight_gpu_gemm(bottom_data + bottom[i]->offset(n), - top_diff + top[i]->offset(n), weight_diff); - } - // gradient w.r.t. bottom data, if necessary. 
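// Illustrative note (editor's example, not part of the patch): why the OpenCL
// paths pass offsets instead of adjusted pointers. The CUDA code removed here
// advances raw device pointers per image, e.g.
//
//   this->forward_gpu_gemm(bottom_data + bottom[i]->offset(n), weight,
//                          top_data + top[i]->offset(n));
//
// An OpenCL buffer is an opaque cl_mem handle, so no pointer arithmetic is
// possible on the host. The replacement *_gpu_org / *_gpu_batched paths keep
// the base handles and stash the per-image element offsets in members
// (bottom_offset_, top_offset_, col_offset_) that the clBLAS-backed
// caffe_gpu_gemm / caffe_gpu_gemv wrappers take as explicit offset arguments
// alongside each buffer.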
- if (propagate_down[i]) { - this->backward_gpu_gemm(top_diff + top[i]->offset(n), weight, - bottom_diff + bottom[i]->offset(n)); - } - } - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(ConvolutionLayer); - -} // namespace caffe diff --git a/src/caffe/layers/cudnn_conv_layer.cpp b/src/caffe/layers/cudnn_conv_layer.cpp deleted file mode 100644 index 104d2b9d..00000000 --- a/src/caffe/layers/cudnn_conv_layer.cpp +++ /dev/null @@ -1,130 +0,0 @@ -#ifdef USE_CUDNN -#include - -#include "caffe/filler.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/im2col.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -// Set to three for the benefit of the backward pass, which -// can use separate streams for calculating the gradient w.r.t. -// bias, filter weights, and bottom data for each group independently -#define CUDNN_STREAMS_PER_GROUP 3 - -/** - * TODO(dox) explain cuDNN interface - */ -template -void CuDNNConvolutionLayer::LayerSetUp( - const vector*>& bottom, const vector*>& top) { - ConvolutionLayer::LayerSetUp(bottom, top); - // Initialize CUDA streams and cuDNN. - stream_ = new cudaStream_t[this->group_ * CUDNN_STREAMS_PER_GROUP]; - handle_ = new cudnnHandle_t[this->group_ * CUDNN_STREAMS_PER_GROUP]; - workspaceSizeInBytes = 0; - workspace = NULL; - - for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) { - CUDA_CHECK(cudaStreamCreate(&stream_[g])); - CUDNN_CHECK(cudnnCreate(&handle_[g])); - CUDNN_CHECK(cudnnSetStream(handle_[g], stream_[g])); - } - - // Set the indexing parameters. - weight_offset_ = (this->num_output_ / this->group_) - * (this->channels_ / this->group_) * this->kernel_h_ * this->kernel_w_; - bias_offset_ = (this->num_output_ / this->group_); - - // Create filter descriptor. - cudnn::createFilterDesc(&filter_desc_, - this->num_output_ / this->group_, this->channels_ / this->group_, - this->kernel_h_, this->kernel_w_); - - // Create tensor descriptor(s) for data and corresponding convolution(s). - for (int i = 0; i < bottom.size(); i++) { - cudnnTensorDescriptor_t bottom_desc; - cudnn::createTensor4dDesc(&bottom_desc); - bottom_descs_.push_back(bottom_desc); - cudnnTensorDescriptor_t top_desc; - cudnn::createTensor4dDesc(&top_desc); - top_descs_.push_back(top_desc); - cudnnConvolutionDescriptor_t conv_desc; - cudnn::createConvolutionDesc(&conv_desc); - conv_descs_.push_back(conv_desc); - } - - // Tensor descriptor for bias. 
- if (this->bias_term_) { - cudnn::createTensor4dDesc(&bias_desc_); - } - - handles_setup_ = true; -} - -template -void CuDNNConvolutionLayer::Reshape( - const vector*>& bottom, const vector*>& top) { - ConvolutionLayer::Reshape(bottom, top); - bottom_offset_ = (this->channels_ / this->group_) - * this->height_ * this->width_; - top_offset_ = (this->num_output_ / this->group_) - * this->height_out_ * this->width_out_; - - for (int i = 0; i < bottom.size(); i++) { - cudnn::setTensor4dDesc(&bottom_descs_[i], - this->num_, - this->channels_ / this->group_, - this->height_, this->width_, - this->channels_ * this->height_ * this->width_, - this->height_ * this->width_, - this->width_, 1); - cudnn::setTensor4dDesc(&top_descs_[i], - this->num_, - this->num_output_ / this->group_, - this->height_out_, this->width_out_, - this->num_output_ * this->height_out_ * this->width_out_, - this->height_out_ * this->width_out_, - this->width_out_, 1); - cudnn::setConvolutionDesc(&conv_descs_[i], bottom_descs_[i], - filter_desc_, this->pad_h_, this->pad_w_, - this->stride_h_, this->stride_w_); - } - - // Tensor descriptor for bias. - if (this->bias_term_) { - cudnn::setTensor4dDesc(&bias_desc_, - 1, this->num_output_ / this->group_, 1, 1); - } -} - -template -CuDNNConvolutionLayer::~CuDNNConvolutionLayer() { - // Check that handles have been setup before destroying. - if (!handles_setup_) { return; } - - for (int i = 0; i < bottom_descs_.size(); i++) { - cudnnDestroyTensorDescriptor(bottom_descs_[i]); - cudnnDestroyTensorDescriptor(top_descs_[i]); - cudnnDestroyConvolutionDescriptor(conv_descs_[i]); - } - if (this->bias_term_) { - cudnnDestroyTensorDescriptor(bias_desc_); - } - cudnnDestroyFilterDescriptor(filter_desc_); - - for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) { - cudaStreamDestroy(stream_[g]); - cudnnDestroy(handle_[g]); - } - - delete [] stream_; - delete [] handle_; -} - -INSTANTIATE_CLASS(CuDNNConvolutionLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cudnn_conv_layer.cu b/src/caffe/layers/cudnn_conv_layer.cu deleted file mode 100644 index b4e802e1..00000000 --- a/src/caffe/layers/cudnn_conv_layer.cu +++ /dev/null @@ -1,160 +0,0 @@ -#ifdef USE_CUDNN -#include - -#include "caffe/filler.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/im2col.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -__global__ void sync_conv_groups() { } - -template -void CuDNNConvolutionLayer::Forward_gpu( - const vector*>& bottom, const vector*>& top) { - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* top_data = top[i]->mutable_gpu_data(); - const Dtype* weight = this->blobs_[0]->gpu_data(); - - size_t workspace_limit_bytes = this->kernel_h_ * - this->kernel_w_ * - this->channels_ * - sizeof(int) + 1; - - // Forward through cuDNN in parallel over groups. 
- for (int g = 0; g < this->group_; g++) { - cudnnConvolutionFwdAlgo_t algo; - - // pick the convolution algorithm - // TODO(shelhamer) this should be done during reshape - // TODO(shelhamer) the choice of automatic or manual algorithm picking - // should be exposed in proto - CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(handle_[g], - bottom_descs_[i], - filter_desc_, - conv_descs_[i], - top_descs_[i], - CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, - workspace_limit_bytes, // memoryLimitInBytes, - &algo)); - - // get minimum size of the workspace needed for the desired algorithm - size_t workspaceSizeInBytes_temp = 0; - - CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(handle_[g], - bottom_descs_[i], - filter_desc_, - conv_descs_[i], - top_descs_[i], - algo, - &workspaceSizeInBytes_temp)); - - if (workspaceSizeInBytes_temp > workspaceSizeInBytes) { - workspaceSizeInBytes = workspaceSizeInBytes_temp; - // free the existing workspace and allocate a new (larger) one - cudaFree(this->workspace); - cudaError_t err = cudaMalloc(&(this->workspace), workspaceSizeInBytes); - if (err != cudaSuccess) { - // force zero memory path - algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; - workspace = NULL; - workspaceSizeInBytes = 0; - } - } - - // Filters. - CUDNN_CHECK(cudnnConvolutionForward(handle_[g], - cudnn::dataType::one, - bottom_descs_[i], bottom_data + bottom_offset_ * g, - filter_desc_, weight + weight_offset_ * g, - conv_descs_[i], - algo, workspace, workspaceSizeInBytes, - cudnn::dataType::zero, - top_descs_[i], top_data + top_offset_ * g)); - - // Bias. - if (this->bias_term_) { - const Dtype* bias_data = this->blobs_[1]->gpu_data(); - CUDNN_CHECK(cudnnAddTensor(handle_[g], CUDNN_ADD_SAME_C, - cudnn::dataType::one, - bias_desc_, bias_data + bias_offset_ * g, - cudnn::dataType::one, - top_descs_[i], top_data + top_offset_ * g)); - } - } - - // Synchronize the work across groups, each of which went into its own - // stream, by launching an empty kernel into the default (null) stream. - // NOLINT_NEXT_LINE(whitespace/operators) - sync_conv_groups<<<1, 1>>>(); - } -} - -template -void CuDNNConvolutionLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* weight = NULL; - Dtype* weight_diff = NULL; - if (this->param_propagate_down_[0]) { - weight = this->blobs_[0]->gpu_data(); - weight_diff = this->blobs_[0]->mutable_gpu_diff(); - } - Dtype* bias_diff = NULL; - if (this->bias_term_ && this->param_propagate_down_[1]) { - bias_diff = this->blobs_[1]->mutable_gpu_diff(); - } - for (int i = 0; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->gpu_diff(); - // Backward through cuDNN in parallel over groups and gradients. - for (int g = 0; g < this->group_; g++) { - // Gradient w.r.t. bias. - if (this->bias_term_ && this->param_propagate_down_[1]) { - CUDNN_CHECK(cudnnConvolutionBackwardBias(handle_[0*this->group_ + g], - cudnn::dataType::one, - top_descs_[i], top_diff + top_offset_ * g, - cudnn::dataType::one, - bias_desc_, bias_diff + bias_offset_ * g)); - } - - // Gradient w.r.t. weights. - if (this->param_propagate_down_[0]) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - CUDNN_CHECK(cudnnConvolutionBackwardFilter(handle_[1*this->group_ + g], - cudnn::dataType::one, - bottom_descs_[i], bottom_data + bottom_offset_ * g, - top_descs_[i], top_diff + top_offset_ * g, - conv_descs_[i], - cudnn::dataType::one, - filter_desc_, weight_diff + weight_offset_ * g)); - } - - // Gradient w.r.t. bottom data. 
- if (propagate_down[i]) { - if (weight == NULL) { - weight = this->blobs_[0]->gpu_data(); - } - Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); - CUDNN_CHECK(cudnnConvolutionBackwardData(handle_[2*this->group_ + g], - cudnn::dataType::one, - filter_desc_, weight + weight_offset_ * g, - top_descs_[i], top_diff + top_offset_ * g, - conv_descs_[i], - cudnn::dataType::zero, - bottom_descs_[i], bottom_diff + bottom_offset_ * g)); - } - } - - // Synchronize the work across groups, each of which went into its own - // stream, by launching an empty kernel into the default (null) stream. - // NOLINT_NEXT_LINE(whitespace/operators) - sync_conv_groups<<<1, 1>>>(); - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(CuDNNConvolutionLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cudnn_pooling_layer.cpp b/src/caffe/layers/cudnn_pooling_layer.cpp deleted file mode 100644 index c92c4e47..00000000 --- a/src/caffe/layers/cudnn_pooling_layer.cpp +++ /dev/null @@ -1,50 +0,0 @@ -#ifdef USE_CUDNN -#include - -#include "caffe/filler.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/im2col.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void CuDNNPoolingLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - PoolingLayer::LayerSetUp(bottom, top); - CUDNN_CHECK(cudnnCreate(&handle_)); - cudnn::createTensor4dDesc(&bottom_desc_); - cudnn::createTensor4dDesc(&top_desc_); - cudnn::createPoolingDesc(&pooling_desc_, - this->layer_param_.pooling_param().pool(), &mode_, - this->kernel_h_, this->kernel_w_, this->pad_h_, this->pad_w_, - this->stride_h_, this->stride_w_); - handles_setup_ = true; -} - -template -void CuDNNPoolingLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - PoolingLayer::Reshape(bottom, top); - cudnn::setTensor4dDesc(&bottom_desc_, bottom[0]->num(), - this->channels_, this->height_, this->width_); - cudnn::setTensor4dDesc(&top_desc_, bottom[0]->num(), - this->channels_, this->pooled_height_, this->pooled_width_); -} - -template -CuDNNPoolingLayer::~CuDNNPoolingLayer() { - // Check that handles have been setup before destroying. 
- if (!handles_setup_) { return; } - - cudnnDestroyTensorDescriptor(bottom_desc_); - cudnnDestroyTensorDescriptor(top_desc_); - cudnnDestroyPoolingDescriptor(pooling_desc_); - cudnnDestroy(handle_); -} - -INSTANTIATE_CLASS(CuDNNPoolingLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cudnn_pooling_layer.cu b/src/caffe/layers/cudnn_pooling_layer.cu deleted file mode 100644 index a952b855..00000000 --- a/src/caffe/layers/cudnn_pooling_layer.cu +++ /dev/null @@ -1,45 +0,0 @@ -#ifdef USE_CUDNN -#include - -#include "caffe/filler.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/im2col.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void CuDNNPoolingLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - CUDNN_CHECK(cudnnPoolingForward(handle_, pooling_desc_, - cudnn::dataType::one, - bottom_desc_, bottom_data, - cudnn::dataType::zero, - top_desc_, top_data)); -} - -template -void CuDNNPoolingLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { - return; - } - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - CUDNN_CHECK(cudnnPoolingBackward(handle_, pooling_desc_, - cudnn::dataType::one, - top_desc_, top_data, top_desc_, top_diff, - bottom_desc_, bottom_data, - cudnn::dataType::zero, - bottom_desc_, bottom_diff)); -} - -INSTANTIATE_LAYER_GPU_FUNCS(CuDNNPoolingLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cudnn_relu_layer.cpp b/src/caffe/layers/cudnn_relu_layer.cpp deleted file mode 100644 index 759d8398..00000000 --- a/src/caffe/layers/cudnn_relu_layer.cpp +++ /dev/null @@ -1,46 +0,0 @@ -#ifdef USE_CUDNN -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void CuDNNReLULayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - ReLULayer::LayerSetUp(bottom, top); - // initialize cuDNN - CUDNN_CHECK(cudnnCreate(&handle_)); - cudnn::createTensor4dDesc(&bottom_desc_); - cudnn::createTensor4dDesc(&top_desc_); - handles_setup_ = true; -} - -template -void CuDNNReLULayer::Reshape(const vector*>& bottom, - const vector*>& top) { - ReLULayer::Reshape(bottom, top); - const int N = bottom[0]->num(); - const int K = bottom[0]->channels(); - const int H = bottom[0]->height(); - const int W = bottom[0]->width(); - cudnn::setTensor4dDesc(&bottom_desc_, N, K, H, W); - cudnn::setTensor4dDesc(&top_desc_, N, K, H, W); -} - -template -CuDNNReLULayer::~CuDNNReLULayer() { - // Check that handles have been setup before destroying. 
- if (!handles_setup_) { return; } - - cudnnDestroyTensorDescriptor(this->bottom_desc_); - cudnnDestroyTensorDescriptor(this->top_desc_); - cudnnDestroy(this->handle_); -} - -INSTANTIATE_CLASS(CuDNNReLULayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cudnn_relu_layer.cu b/src/caffe/layers/cudnn_relu_layer.cu deleted file mode 100644 index 21d14857..00000000 --- a/src/caffe/layers/cudnn_relu_layer.cu +++ /dev/null @@ -1,57 +0,0 @@ -#ifdef USE_CUDNN -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void CuDNNReLULayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - // Fallback to standard Caffe for leaky ReLU. - if (ReLULayer::layer_param_.relu_param().negative_slope() != 0) { - return ReLULayer::Forward_gpu(bottom, top); - } - - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - CUDNN_CHECK(cudnnActivationForward(this->handle_, - CUDNN_ACTIVATION_RELU, - cudnn::dataType::one, - this->bottom_desc_, bottom_data, - cudnn::dataType::zero, - this->top_desc_, top_data)); -} - -template -void CuDNNReLULayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (!propagate_down[0]) { - return; - } - - // Fallback to standard Caffe for leaky ReLU. - if (ReLULayer::layer_param_.relu_param().negative_slope() != 0) { - return ReLULayer::Backward_gpu(top, propagate_down, bottom); - } - - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - CUDNN_CHECK(cudnnActivationBackward(this->handle_, - CUDNN_ACTIVATION_RELU, - cudnn::dataType::one, - this->top_desc_, top_data, this->top_desc_, top_diff, - this->bottom_desc_, bottom_data, - cudnn::dataType::zero, - this->bottom_desc_, bottom_diff)); -} - -INSTANTIATE_LAYER_GPU_FUNCS(CuDNNReLULayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cudnn_sigmoid_layer.cpp b/src/caffe/layers/cudnn_sigmoid_layer.cpp deleted file mode 100644 index 32637873..00000000 --- a/src/caffe/layers/cudnn_sigmoid_layer.cpp +++ /dev/null @@ -1,46 +0,0 @@ -#ifdef USE_CUDNN -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void CuDNNSigmoidLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - SigmoidLayer::LayerSetUp(bottom, top); - // initialize cuDNN - CUDNN_CHECK(cudnnCreate(&handle_)); - cudnn::createTensor4dDesc(&bottom_desc_); - cudnn::createTensor4dDesc(&top_desc_); - handles_setup_ = true; -} - -template -void CuDNNSigmoidLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - SigmoidLayer::Reshape(bottom, top); - const int N = bottom[0]->num(); - const int K = bottom[0]->channels(); - const int H = bottom[0]->height(); - const int W = bottom[0]->width(); - cudnn::setTensor4dDesc(&bottom_desc_, N, K, H, W); - cudnn::setTensor4dDesc(&top_desc_, N, K, H, W); -} - -template -CuDNNSigmoidLayer::~CuDNNSigmoidLayer() { - // Check that handles have been setup before destroying. 
-  if (!handles_setup_) { return; }
-
-  cudnnDestroyTensorDescriptor(this->bottom_desc_);
-  cudnnDestroyTensorDescriptor(this->top_desc_);
-  cudnnDestroy(this->handle_);
-}
-
-INSTANTIATE_CLASS(CuDNNSigmoidLayer);
-
-}  // namespace caffe
-#endif
diff --git a/src/caffe/layers/cudnn_sigmoid_layer.cu b/src/caffe/layers/cudnn_sigmoid_layer.cu
deleted file mode 100644
index 7a06cf72..00000000
--- a/src/caffe/layers/cudnn_sigmoid_layer.cu
+++ /dev/null
@@ -1,47 +0,0 @@
-#ifdef USE_CUDNN
-#include <algorithm>
-#include <vector>
-
-#include "caffe/layer.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void CuDNNSigmoidLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* top_data = top[0]->mutable_gpu_data();
-  CUDNN_CHECK(cudnnActivationForward(this->handle_,
-        CUDNN_ACTIVATION_SIGMOID,
-        cudnn::dataType<Dtype>::one,
-        this->bottom_desc_, bottom_data,
-        cudnn::dataType<Dtype>::zero,
-        this->top_desc_, top_data));
-}
-
-template <typename Dtype>
-void CuDNNSigmoidLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
-    const vector<bool>& propagate_down,
-    const vector<Blob<Dtype>*>& bottom) {
-  if (!propagate_down[0]) {
-    return;
-  }
-
-  const Dtype* top_data = top[0]->gpu_data();
-  const Dtype* top_diff = top[0]->gpu_diff();
-  const Dtype* bottom_data = bottom[0]->gpu_data();
-  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-  CUDNN_CHECK(cudnnActivationBackward(this->handle_,
-        CUDNN_ACTIVATION_SIGMOID,
-        cudnn::dataType<Dtype>::one,
-        this->top_desc_, top_data, this->top_desc_, top_diff,
-        this->bottom_desc_, bottom_data,
-        cudnn::dataType<Dtype>::zero,
-        this->bottom_desc_, bottom_diff));
-}
-
-INSTANTIATE_LAYER_GPU_FUNCS(CuDNNSigmoidLayer);
-
-}  // namespace caffe
-#endif
diff --git a/src/caffe/layers/cudnn_softmax_layer.cpp b/src/caffe/layers/cudnn_softmax_layer.cpp
deleted file mode 100644
index 77a3225a..00000000
--- a/src/caffe/layers/cudnn_softmax_layer.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-#ifdef USE_CUDNN
-#include <algorithm>
-#include <cfloat>
-#include <vector>
-
-#include "thrust/device_vector.h"
-
-#include "caffe/layer.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/vision_layers.hpp"
-
-namespace caffe {
-
-template <typename Dtype>
-void CuDNNSoftmaxLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  SoftmaxLayer<Dtype>::LayerSetUp(bottom, top);
-  // Initialize CUDNN.
-  CUDNN_CHECK(cudnnCreate(&handle_));
-  cudnn::createTensor4dDesc<Dtype>(&bottom_desc_);
-  cudnn::createTensor4dDesc<Dtype>(&top_desc_);
-  handles_setup_ = true;
-}
-
-template <typename Dtype>
-void CuDNNSoftmaxLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  SoftmaxLayer<Dtype>::Reshape(bottom, top);
-  int N = this->outer_num_;
-  int K = bottom[0]->shape(this->softmax_axis_);
-  int H = this->inner_num_;
-  int W = 1;
-  cudnn::setTensor4dDesc<Dtype>(&bottom_desc_, N, K, H, W);
-  cudnn::setTensor4dDesc<Dtype>(&top_desc_, N, K, H, W);
-}
-
-template <typename Dtype>
-CuDNNSoftmaxLayer<Dtype>::~CuDNNSoftmaxLayer() {
-  // Check that handles have been setup before destroying.
- if (!handles_setup_) { return; } - - cudnnDestroyTensorDescriptor(bottom_desc_); - cudnnDestroyTensorDescriptor(top_desc_); - cudnnDestroy(handle_); -} - -INSTANTIATE_CLASS(CuDNNSoftmaxLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cudnn_softmax_layer.cu b/src/caffe/layers/cudnn_softmax_layer.cu deleted file mode 100644 index a9e2fcef..00000000 --- a/src/caffe/layers/cudnn_softmax_layer.cu +++ /dev/null @@ -1,48 +0,0 @@ -#ifdef USE_CUDNN -#include -#include -#include - -#include "thrust/device_vector.h" - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void CuDNNSoftmaxLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - CUDNN_CHECK(cudnnSoftmaxForward(handle_, CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - cudnn::dataType::one, - bottom_desc_, bottom_data, - cudnn::dataType::zero, - top_desc_, top_data)); -} - -template -void CuDNNSoftmaxLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - - CUDNN_CHECK(cudnnSoftmaxBackward(handle_, CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - cudnn::dataType::one, - top_desc_, top_data, top_desc_, top_diff, - cudnn::dataType::zero, - bottom_desc_, bottom_diff)); - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(CuDNNSoftmaxLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cudnn_tanh_layer.cpp b/src/caffe/layers/cudnn_tanh_layer.cpp deleted file mode 100644 index 376faad3..00000000 --- a/src/caffe/layers/cudnn_tanh_layer.cpp +++ /dev/null @@ -1,46 +0,0 @@ -#ifdef USE_CUDNN -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void CuDNNTanHLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - TanHLayer::LayerSetUp(bottom, top); - // initialize cuDNN - CUDNN_CHECK(cudnnCreate(&handle_)); - cudnn::createTensor4dDesc(&bottom_desc_); - cudnn::createTensor4dDesc(&top_desc_); - handles_setup_ = true; -} - -template -void CuDNNTanHLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - TanHLayer::Reshape(bottom, top); - const int N = bottom[0]->num(); - const int K = bottom[0]->channels(); - const int H = bottom[0]->height(); - const int W = bottom[0]->width(); - cudnn::setTensor4dDesc(&bottom_desc_, N, K, H, W); - cudnn::setTensor4dDesc(&top_desc_, N, K, H, W); -} - -template -CuDNNTanHLayer::~CuDNNTanHLayer() { - // Check that handles have been setup before destroying. 
- if (!handles_setup_) { return; } - - cudnnDestroyTensorDescriptor(this->bottom_desc_); - cudnnDestroyTensorDescriptor(this->top_desc_); - cudnnDestroy(this->handle_); -} - -INSTANTIATE_CLASS(CuDNNTanHLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/cudnn_tanh_layer.cu b/src/caffe/layers/cudnn_tanh_layer.cu deleted file mode 100644 index d287f6fe..00000000 --- a/src/caffe/layers/cudnn_tanh_layer.cu +++ /dev/null @@ -1,48 +0,0 @@ -#ifdef USE_CUDNN -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void CuDNNTanHLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - CUDNN_CHECK(cudnnActivationForward(this->handle_, - CUDNN_ACTIVATION_TANH, - cudnn::dataType::one, - this->bottom_desc_, bottom_data, - cudnn::dataType::zero, - this->top_desc_, top_data)); -} - -template -void CuDNNTanHLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (!propagate_down[0]) { - return; - } - - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - - CUDNN_CHECK(cudnnActivationBackward(this->handle_, - CUDNN_ACTIVATION_TANH, - cudnn::dataType::one, - this->top_desc_, top_data, this->top_desc_, top_diff, - this->bottom_desc_, bottom_data, - cudnn::dataType::zero, - this->bottom_desc_, bottom_diff)); -} - -INSTANTIATE_LAYER_GPU_FUNCS(CuDNNTanHLayer); - -} // namespace caffe -#endif diff --git a/src/caffe/layers/data_layer.cpp b/src/caffe/layers/data_layer.cpp index 161a75e0..fdae75a0 100644 --- a/src/caffe/layers/data_layer.cpp +++ b/src/caffe/layers/data_layer.cpp @@ -23,7 +23,7 @@ DataLayer::~DataLayer() { template void DataLayer::DataLayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { // Initialize DB db_.reset(db::GetDB(this->layer_param_.data_param().backend())); db_->Open(this->layer_param_.data_param().source(), db::READ); @@ -31,8 +31,8 @@ void DataLayer::DataLayerSetUp(const vector*>& bottom, // Check if we should randomly skip a few data points if (this->layer_param_.data_param().rand_skip()) { - unsigned int skip = caffe_rng_rand() % - this->layer_param_.data_param().rand_skip(); + unsigned int skip = caffe_rng_rand() + % this->layer_param_.data_param().rand_skip(); LOG(INFO) << "Skipping first " << skip << " data points."; while (skip-- > 0) { cursor_->Next(); @@ -48,6 +48,7 @@ void DataLayer::DataLayerSetUp(const vector*>& bottom, top_shape[0] = this->layer_param_.data_param().batch_size(); this->prefetch_data_.Reshape(top_shape); top[0]->ReshapeLike(this->prefetch_data_); + this->prefetch_data_.set_data_layer(); LOG(INFO) << "output data size: " << top[0]->num() << "," << top[0]->channels() << "," << top[0]->height() << "," @@ -57,6 +58,7 @@ void DataLayer::DataLayerSetUp(const vector*>& bottom, vector label_shape(1, this->layer_param_.data_param().batch_size()); top[1]->Reshape(label_shape); this->prefetch_label_.Reshape(label_shape); + this->prefetch_label_.set_data_layer(); } } @@ -120,7 +122,7 @@ void DataLayer::InternalThreadEntry() { DLOG(INFO) << "Transform time: " << trans_time / 1000 << " ms."; } -INSTANTIATE_CLASS(DataLayer); -REGISTER_LAYER_CLASS(Data); +INSTANTIATE_CLASS (DataLayer); +REGISTER_LAYER_CLASS (Data); } // namespace caffe diff --git 
a/src/caffe/layers/deconv_layer.cpp b/src/caffe/layers/deconv_layer.cpp index a4612963..ddf906b7 100644 --- a/src/caffe/layers/deconv_layer.cpp +++ b/src/caffe/layers/deconv_layer.cpp @@ -18,7 +18,7 @@ void DeconvolutionLayer::compute_output_shape() { template void DeconvolutionLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const Dtype* weight = this->blobs_[0]->cpu_data(); for (int i = 0; i < bottom.size(); ++i) { const Dtype* bottom_data = bottom[i]->cpu_data(); @@ -36,7 +36,7 @@ void DeconvolutionLayer::Forward_cpu(const vector*>& bottom, template void DeconvolutionLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { const Dtype* weight = this->blobs_[0]->cpu_data(); Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff(); for (int i = 0; i < top.size(); ++i) { @@ -69,11 +69,69 @@ void DeconvolutionLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY +#ifndef CPU_ONLY +// begin: code modified for OpenCL port + +template +void DeconvolutionLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + const Dtype* weight = this->blobs_[0]->gpu_data(); + for (int i = 0; i < bottom.size(); ++i) { + const Dtype* bottom_data = bottom[i]->gpu_data(); + Dtype* top_data = top[i]->mutable_gpu_data(); + for (int n = 0; n < this->num_; ++n) { + this->bottom_offset_ = top[i]->offset(n); + this->top_offset_ = bottom[i]->offset(n); + this->backward_gpu_gemm(bottom_data, weight, top_data); + if (this->bias_term_) { + this->top_offset_ = top[i]->offset(n); + const Dtype* bias = this->blobs_[1]->gpu_data(); + this->forward_gpu_bias(top_data, bias); + } + } + } +} + +template +void DeconvolutionLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + const Dtype* weight = this->blobs_[0]->gpu_data(); + Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); + for (int i = 0; i < top.size(); ++i) { + const Dtype* top_diff = top[i]->gpu_diff(); + const Dtype* bottom_data = bottom[i]->gpu_data(); + Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); + // Bias gradient, if necessary. + if (this->bias_term_ && this->param_propagate_down_[1]) { + Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); + for (int n = 0; n < this->num_; ++n) { + this->top_offset_ = top[i]->offset(n); + this->backward_gpu_bias(bias_diff, top_diff); + } + } + if (this->param_propagate_down_[0] || propagate_down[i]) { + for (int n = 0; n < this->num_; ++n) { + this->top_offset_ = bottom[i]->offset(n); + this->bottom_offset_ = top[i]->offset(n); + // gradient w.r.t. weight. Note that we will accumulate diffs. + if (this->param_propagate_down_[0]) { + this->weight_gpu_gemm(top_diff, bottom_data, weight_diff); + } + // gradient w.r.t. bottom data, if necessary. 
+ if (propagate_down[i]) { + this->forward_gpu_gemm(top_diff, weight, bottom_diff); + } + } + } + } +} +// end: code modified for OpenCL port + +#else STUB_GPU(DeconvolutionLayer); #endif -INSTANTIATE_CLASS(DeconvolutionLayer); -REGISTER_LAYER_CLASS(Deconvolution); +INSTANTIATE_CLASS (DeconvolutionLayer); +REGISTER_LAYER_CLASS (Deconvolution); } // namespace caffe diff --git a/src/caffe/layers/deconv_layer.cu b/src/caffe/layers/deconv_layer.cu deleted file mode 100644 index 39bc4de8..00000000 --- a/src/caffe/layers/deconv_layer.cu +++ /dev/null @@ -1,64 +0,0 @@ -#include - -#include "caffe/filler.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/im2col.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void DeconvolutionLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* weight = this->blobs_[0]->gpu_data(); - for (int i = 0; i < bottom.size(); ++i) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* top_data = top[i]->mutable_gpu_data(); - for (int n = 0; n < this->num_; ++n) { - this->backward_gpu_gemm(bottom_data + bottom[i]->offset(n), weight, - top_data + top[i]->offset(n)); - if (this->bias_term_) { - const Dtype* bias = this->blobs_[1]->gpu_data(); - this->forward_gpu_bias(top_data + top[i]->offset(n), bias); - } - } - } -} - -template -void DeconvolutionLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* weight = this->blobs_[0]->gpu_data(); - Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); - for (int i = 0; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->gpu_diff(); - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); - // Bias gradient, if necessary. - if (this->bias_term_ && this->param_propagate_down_[1]) { - Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); - for (int n = 0; n < this->num_; ++n) { - this->backward_gpu_bias(bias_diff, top_diff + top[i]->offset(n)); - } - } - if (this->param_propagate_down_[0] || propagate_down[i]) { - for (int n = 0; n < this->num_; ++n) { - // gradient w.r.t. weight. Note that we will accumulate diffs. - if (this->param_propagate_down_[0]) { - this->weight_gpu_gemm(top_diff + top[i]->offset(n), - bottom_data + bottom[i]->offset(n), weight_diff); - } - // gradient w.r.t. bottom data, if necessary. - if (propagate_down[i]) { - this->forward_gpu_gemm(top_diff + top[i]->offset(n), weight, - bottom_diff + bottom[i]->offset(n)); - } - } - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(DeconvolutionLayer); - -} // namespace caffe diff --git a/src/caffe/layers/dropout_layer.cpp b/src/caffe/layers/dropout_layer.cpp index ec1256fd..21699414 100644 --- a/src/caffe/layers/dropout_layer.cpp +++ b/src/caffe/layers/dropout_layer.cpp @@ -12,19 +12,19 @@ namespace caffe { template void DropoutLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - NeuronLayer::LayerSetUp(bottom, top); - threshold_ = this->layer_param_.dropout_param().dropout_ratio(); - DCHECK(threshold_ > 0.); - DCHECK(threshold_ < 1.); - scale_ = 1. / (1. - threshold_); - uint_thres_ = static_cast(UINT_MAX * threshold_); + const vector*>& top) { + NeuronLayer < Dtype > ::LayerSetUp(bottom, top); + threshold_ = this->layer_param_.dropout_param().dropout_ratio(); + DCHECK(threshold_ > 0.); + DCHECK(threshold_ < 1.); + scale_ = 1. / (1. 
- threshold_); + uint_thres_ = static_cast(UINT_MAX * threshold_); } template void DropoutLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - NeuronLayer::Reshape(bottom, top); + const vector*>& top) { + NeuronLayer < Dtype > ::Reshape(bottom, top); // Set up the cache for random number generation rand_vec_.Reshape(bottom[0]->num(), bottom[0]->channels(), bottom[0]->height(), bottom[0]->width()); @@ -50,8 +50,7 @@ void DropoutLayer::Forward_cpu(const vector*>& bottom, template void DropoutLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { if (propagate_down[0]) { const Dtype* top_diff = top[0]->cpu_diff(); Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); @@ -67,12 +66,52 @@ void DropoutLayer::Backward_cpu(const vector*>& top, } } +#ifndef CPU_ONLY +// begin: code modified for OpenCL port +template +void DropoutLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + const int count = bottom[0]->count(); + if (this->phase_ == TRAIN) { + unsigned int* mask = + static_cast(rand_vec_.mutable_gpu_data()); + caffe_gpu_rng_uniform(count, mask); + // set thresholds + // NOLINT_NEXT_LINE(whitespace/operators) + DropoutForward(count, bottom_data, mask, uint_thres_, scale_, top_data); + } else { + if(bottom_data != top_data) + caffe_gpu_copy(count, bottom_data, top_data); + } +} + +template +void DropoutLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[0]) { + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + if (this->phase_ == TRAIN) { + const unsigned int* mask = + static_cast(rand_vec_.gpu_data()); + const int count = bottom[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + DropoutBackward(count, top_diff, mask, uint_thres_, scale_, bottom_diff); + } else { + if(bottom_diff != top_diff) + caffe_gpu_copy(top[0]->count(), top_diff, bottom_diff); + } + } +} +// end: code modified for OpenCL port -#ifdef CPU_ONLY +#else STUB_GPU(DropoutLayer); #endif -INSTANTIATE_CLASS(DropoutLayer); -REGISTER_LAYER_CLASS(Dropout); +INSTANTIATE_CLASS (DropoutLayer); +REGISTER_LAYER_CLASS (Dropout); } // namespace caffe diff --git a/src/caffe/layers/dropout_layer.cu b/src/caffe/layers/dropout_layer.cu deleted file mode 100644 index f9ea04f4..00000000 --- a/src/caffe/layers/dropout_layer.cu +++ /dev/null @@ -1,77 +0,0 @@ -#include -#include -#include - -#include "caffe/common.hpp" -#include "caffe/layer.hpp" -#include "caffe/syncedmem.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - - -template -__global__ void DropoutForward(const int n, const Dtype* in, - const unsigned int* mask, const unsigned int threshold, const float scale, - Dtype* out) { - CUDA_KERNEL_LOOP(index, n) { - out[index] = in[index] * (mask[index] > threshold) * scale; - } -} - -template -void DropoutLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - if (this->phase_ == TRAIN) { - unsigned int* mask = - static_cast(rand_vec_.mutable_gpu_data()); - caffe_gpu_rng_uniform(count, mask); - // set thresholds - // NOLINT_NEXT_LINE(whitespace/operators) - DropoutForward<<>>( - count, 
bottom_data, mask, uint_thres_, scale_, top_data); - CUDA_POST_KERNEL_CHECK; - } else { - caffe_copy(count, bottom_data, top_data); - } -} - -template -__global__ void DropoutBackward(const int n, const Dtype* in_diff, - const unsigned int* mask, const unsigned int threshold, const float scale, - Dtype* out_diff) { - CUDA_KERNEL_LOOP(index, n) { - out_diff[index] = in_diff[index] * scale * (mask[index] > threshold); - } -} - -template -void DropoutLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - if (this->phase_ == TRAIN) { - const unsigned int* mask = - static_cast(rand_vec_.gpu_data()); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - DropoutBackward<<>>( - count, top_diff, mask, uint_thres_, scale_, bottom_diff); - CUDA_POST_KERNEL_CHECK; - } else { - caffe_copy(top[0]->count(), top_diff, bottom_diff); - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(DropoutLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/dummy_data_layer.cpp b/src/caffe/layers/dummy_data_layer.cpp index 6b0d6174..f13f3be1 100644 --- a/src/caffe/layers/dummy_data_layer.cpp +++ b/src/caffe/layers/dummy_data_layer.cpp @@ -8,37 +8,38 @@ namespace caffe { template void DummyDataLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const int num_top = top.size(); const DummyDataParameter& param = this->layer_param_.dummy_data_param(); const int num_data_filler = param.data_filler_size(); - CHECK(num_data_filler == 0 || num_data_filler == 1 || - num_data_filler == num_top) + CHECK( + num_data_filler == 0 || num_data_filler == 1 + || num_data_filler == num_top) << "Number of data fillers must be 0, 1 or equal to the number of tops: " << num_top << "; you specified " << num_data_filler << " data fillers."; - const bool legacy_dims = param.num_size() || param.channels_size() || - param.height_size() || param.width_size(); + const bool legacy_dims = param.num_size() || param.channels_size() + || param.height_size() || param.width_size(); if (legacy_dims) { CHECK_EQ(0, param.shape_size()) << "Both shape and legacy fields were specified"; // Using deprecated 4D output dim specifiers. 
CHECK(param.num_size() == 1 || param.num_size() == num_top) - << "Must specify 'num' once, or once per top blob " - << "(" << num_top << "); specified " << param.num_size() << "."; + << "Must specify 'num' once, or once per top blob " << "(" << num_top + << "); specified " << param.num_size() << "."; CHECK(param.channels_size() == 1 || param.channels_size() == num_top) - << "Must specify 'channels' once, or once per top blob " - << "(" << num_top << "); specified " << param.channels_size() << "."; + << "Must specify 'channels' once, or once per top blob " << "(" + << num_top << "); specified " << param.channels_size() << "."; CHECK(param.height_size() == 1 || param.height_size() == num_top) - << "Must specify 'height' once, or once per top blob " - << "(" << num_top << "); specified " << param.height_size() << "."; + << "Must specify 'height' once, or once per top blob " << "(" << num_top + << "); specified " << param.height_size() << "."; CHECK(param.width_size() == 1 || param.width_size() == num_top) - << "Must specify 'width' once, or once per top blob " - << "(" << num_top << "); specified " << param.width_size() << "."; + << "Must specify 'width' once, or once per top blob " << "(" << num_top + << "); specified " << param.width_size() << "."; } else { CHECK(param.shape_size() == 1 || param.shape_size() == num_top) - << "Must specify 'shape' once, or once per top blob " - << "(" << num_top << "); specified " << param.shape_size() << "."; + << "Must specify 'shape' once, or once per top blob " << "(" << num_top + << "); specified " << param.shape_size() << "."; } // refill_[i] tells Forward i whether or not to actually refill top Blob i. // If refill_[i] is false, Forward does nothing for Blob i. We use this to @@ -62,12 +63,12 @@ void DummyDataLayer::LayerSetUp(const vector*>& bottom, refill_.resize(1); refill_[0] = (strcmp(filler_param.type().c_str(), "constant") == 0); fillers_.resize(1); - fillers_[0].reset(GetFiller(filler_param)); + fillers_[0].reset(GetFiller < Dtype > (filler_param)); } else { refill_.resize(num_top); fillers_.resize(num_top); for (int i = 0; i < num_top; ++i) { - fillers_[i].reset(GetFiller(param.data_filler(i))); + fillers_[i].reset(GetFiller < Dtype > (param.data_filler(i))); // Refill on each iteration iff not using a constant filler, // but use the inverse of this rule for the first run. refill_[i] = @@ -100,7 +101,7 @@ void DummyDataLayer::LayerSetUp(const vector*>& bottom, template void DummyDataLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { for (int i = 0; i < top.size(); ++i) { const int filler_id = (fillers_.size() > 1) ? 
i : 0; if (refill_[filler_id]) { @@ -109,7 +110,7 @@ void DummyDataLayer::Forward_cpu(const vector*>& bottom, } } -INSTANTIATE_CLASS(DummyDataLayer); -REGISTER_LAYER_CLASS(DummyData); +INSTANTIATE_CLASS (DummyDataLayer); +REGISTER_LAYER_CLASS (DummyData); } // namespace caffe diff --git a/src/caffe/layers/eltwise_layer.cpp b/src/caffe/layers/eltwise_layer.cpp index a8070073..84cc279c 100644 --- a/src/caffe/layers/eltwise_layer.cpp +++ b/src/caffe/layers/eltwise_layer.cpp @@ -9,17 +9,19 @@ namespace caffe { template void EltwiseLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - CHECK(this->layer_param().eltwise_param().coeff_size() == 0 - || this->layer_param().eltwise_param().coeff_size() == bottom.size()) << - "Eltwise Layer takes one coefficient per bottom blob."; - CHECK(!(this->layer_param().eltwise_param().operation() - == EltwiseParameter_EltwiseOp_PROD - && this->layer_param().eltwise_param().coeff_size())) << - "Eltwise layer only takes coefficients for summation."; + const vector*>& top) { + CHECK( + this->layer_param().eltwise_param().coeff_size() == 0 + || this->layer_param().eltwise_param().coeff_size() == bottom.size()) + << "Eltwise Layer takes one coefficient per bottom blob."; + CHECK( + !(this->layer_param().eltwise_param().operation() + == EltwiseParameter_EltwiseOp_PROD + && this->layer_param().eltwise_param().coeff_size())) + << "Eltwise layer only takes coefficients for summation."; op_ = this->layer_param_.eltwise_param().operation(); // Blob-wise coefficients for the elementwise operation. - coeffs_ = vector(bottom.size(), 1); + coeffs_ = vector < Dtype > (bottom.size(), 1); if (this->layer_param().eltwise_param().coeff_size()) { for (int i = 0; i < bottom.size(); ++i) { coeffs_[i] = this->layer_param().eltwise_param().coeff(i); @@ -30,21 +32,21 @@ void EltwiseLayer::LayerSetUp(const vector*>& bottom, template void EltwiseLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { for (int i = 1; i < bottom.size(); ++i) { CHECK(bottom[i]->shape() == bottom[0]->shape()); } top[0]->ReshapeLike(*bottom[0]); // If max operation, we will initialize the vector index part. 
- if (this->layer_param_.eltwise_param().operation() == - EltwiseParameter_EltwiseOp_MAX && top.size() == 1) { + if (this->layer_param_.eltwise_param().operation() + == EltwiseParameter_EltwiseOp_MAX && top.size() == 1) { max_idx_.Reshape(bottom[0]->shape()); } } template -void EltwiseLayer::Forward_cpu( - const vector*>& bottom, const vector*>& top) { +void EltwiseLayer::Forward_cpu(const vector*>& bottom, + const vector*>& top) { int* mask = NULL; const Dtype* bottom_data_a = NULL; const Dtype* bottom_data_b = NULL; @@ -113,13 +115,14 @@ void EltwiseLayer::Backward_cpu(const vector*>& top, if (stable_prod_grad_) { bool initialized = false; for (int j = 0; j < bottom.size(); ++j) { - if (i == j) { continue; } + if (i == j) { + continue; + } if (!initialized) { caffe_copy(count, bottom[j]->cpu_data(), bottom_diff); initialized = true; } else { - caffe_mul(count, bottom[j]->cpu_data(), bottom_diff, - bottom_diff); + caffe_mul(count, bottom[j]->cpu_data(), bottom_diff, bottom_diff); } } } else { @@ -151,11 +154,100 @@ void EltwiseLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY +// begin: code modified for OpenCL port +#ifndef CPU_ONLY +template +void EltwiseLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + int* mask = NULL; + const int count = top[0]->count(); + Dtype* top_data = top[0]->mutable_gpu_data(); + switch (op_) { + case EltwiseParameter_EltwiseOp_PROD: + caffe_gpu_mul(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), + top_data); + for (int i = 2; i < bottom.size(); ++i) { + caffe_gpu_mul(count, top_data, bottom[i]->gpu_data(), top_data); + } + break; + case EltwiseParameter_EltwiseOp_SUM: + caffe_gpu_set(count, Dtype(0.), top_data); + // TODO(shelhamer) does cuBLAS optimize to sum for coeff = 1? + for (int i = 0; i < bottom.size(); ++i) { + caffe_gpu_axpy(count, coeffs_[i], bottom[i]->gpu_data(), top_data); + } + break; + case EltwiseParameter_EltwiseOp_MAX: + mask = max_idx_.mutable_gpu_data(); + // NOLINT_NEXT_LINE(whitespace/operators) + MaxForward(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), 0, top_data, + mask); + for (int i = 2; i < bottom.size(); ++i) { + // NOLINT_NEXT_LINE(whitespace/operators) + MaxForward(count, top_data, bottom[i]->gpu_data(), i - 1, top_data, mask); + } + break; + default: + LOG(FATAL) << "Unknown elementwise operation."; + } +} + +template +void EltwiseLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + const int* mask = NULL; + const int count = top[0]->count(); + const Dtype* top_data = top[0]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); + for (int i = 0; i < bottom.size(); ++i) { + if (propagate_down[i]) { + const Dtype* bottom_data = bottom[i]->gpu_data(); + Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); + switch (op_) { + case EltwiseParameter_EltwiseOp_PROD: + if (stable_prod_grad_) { + bool initialized = false; + for (int j = 0; j < bottom.size(); ++j) { + if (i == j) { + continue; + } + if (!initialized) { + caffe_gpu_copy(count, bottom[j]->gpu_data(), bottom_diff); + initialized = true; + } else { + caffe_gpu_mul(count, bottom[j]->gpu_data(), bottom_diff, + bottom_diff); + } + } + } else { + caffe_gpu_div(count, top_data, bottom_data, bottom_diff); + } + caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff); + break; + case EltwiseParameter_EltwiseOp_SUM: + if (coeffs_[i] == Dtype(1.)) { + caffe_gpu_copy(count, top_diff, bottom_diff); + } else { + caffe_gpu_scale(count, coeffs_[i], top_diff, bottom_diff); + } + break; + case 
EltwiseParameter_EltwiseOp_MAX: + mask = max_idx_.gpu_data(); + MaxBackward(count, top_diff, i, mask, bottom_diff); + break; + default: + LOG(FATAL) << "Unknown elementwise operation."; + } + } + } +} +// end: code modified for OpenCL port + +#else STUB_GPU(EltwiseLayer); #endif -INSTANTIATE_CLASS(EltwiseLayer); -REGISTER_LAYER_CLASS(Eltwise); +INSTANTIATE_CLASS (EltwiseLayer); +REGISTER_LAYER_CLASS (Eltwise); } // namespace caffe diff --git a/src/caffe/layers/eltwise_layer.cu b/src/caffe/layers/eltwise_layer.cu deleted file mode 100644 index 2247870d..00000000 --- a/src/caffe/layers/eltwise_layer.cu +++ /dev/null @@ -1,135 +0,0 @@ -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -__global__ void MaxForward(const int nthreads, const Dtype* bottom_data_a, - const Dtype* bottom_data_b, const int blob_idx, Dtype* top_data, - int* mask) { - CUDA_KERNEL_LOOP(index, nthreads) { - Dtype maxval = -FLT_MAX; - int maxidx = -1; - if (bottom_data_a[index] > bottom_data_b[index]) { - // only update for very first bottom_data blob (blob_idx == 0) - if (blob_idx == 0) { - maxval = bottom_data_a[index]; - top_data[index] = maxval; - maxidx = blob_idx; - mask[index] = maxidx; - } - } else { - maxval = bottom_data_b[index]; - top_data[index] = maxval; - maxidx = blob_idx + 1; - mask[index] = maxidx; - } - } -} - -template -void EltwiseLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - int* mask = NULL; - const int count = top[0]->count(); - Dtype* top_data = top[0]->mutable_gpu_data(); - switch (op_) { - case EltwiseParameter_EltwiseOp_PROD: - caffe_gpu_mul(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), - top_data); - for (int i = 2; i < bottom.size(); ++i) { - caffe_gpu_mul(count, top_data, bottom[i]->gpu_data(), top_data); - } - break; - case EltwiseParameter_EltwiseOp_SUM: - caffe_gpu_set(count, Dtype(0.), top_data); - // TODO(shelhamer) does cuBLAS optimize to sum for coeff = 1? 
- for (int i = 0; i < bottom.size(); ++i) { - caffe_gpu_axpy(count, coeffs_[i], bottom[i]->gpu_data(), top_data); - } - break; - case EltwiseParameter_EltwiseOp_MAX: - mask = max_idx_.mutable_gpu_data(); - // NOLINT_NEXT_LINE(whitespace/operators) - MaxForward <<>>( - count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), 0, top_data, mask); - for (int i = 2; i < bottom.size(); ++i) { - // NOLINT_NEXT_LINE(whitespace/operators) - MaxForward<<>>( - count, top_data, bottom[i]->gpu_data(), i-1, top_data, mask); - } - break; - default: - LOG(FATAL) << "Unknown elementwise operation."; - } -} - -template -__global__ void MaxBackward(const int nthreads, const Dtype* top_diff, - const int blob_idx, const int* mask, Dtype* bottom_diff) { - CUDA_KERNEL_LOOP(index, nthreads) { - Dtype gradient = 0; - if (mask[index] == blob_idx) { - gradient += top_diff[index]; - } - bottom_diff[index] = gradient; - } -} - -template -void EltwiseLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const int* mask = NULL; - const int count = top[0]->count(); - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - for (int i = 0; i < bottom.size(); ++i) { - if (propagate_down[i]) { - const Dtype* bottom_data = bottom[i]->gpu_data(); - Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); - switch (op_) { - case EltwiseParameter_EltwiseOp_PROD: - if (stable_prod_grad_) { - bool initialized = false; - for (int j = 0; j < bottom.size(); ++j) { - if (i == j) { continue; } - if (!initialized) { - caffe_copy(count, bottom[j]->gpu_data(), bottom_diff); - initialized = true; - } else { - caffe_gpu_mul(count, bottom[j]->gpu_data(), bottom_diff, - bottom_diff); - } - } - } else { - caffe_gpu_div(count, top_data, bottom_data, bottom_diff); - } - caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff); - break; - case EltwiseParameter_EltwiseOp_SUM: - if (coeffs_[i] == Dtype(1.)) { - caffe_copy(count, top_diff, bottom_diff); - } else { - caffe_gpu_scale(count, coeffs_[i], top_diff, bottom_diff); - } - break; - case EltwiseParameter_EltwiseOp_MAX: - mask = max_idx_.gpu_data(); - MaxBackward // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - count, top_diff, i, mask, bottom_diff); - break; - default: - LOG(FATAL) << "Unknown elementwise operation."; - } - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(EltwiseLayer); - -} // namespace caffe diff --git a/src/caffe/layers/euclidean_loss_layer.cpp b/src/caffe/layers/euclidean_loss_layer.cpp index 80efa31b..ea78484b 100644 --- a/src/caffe/layers/euclidean_loss_layer.cpp +++ b/src/caffe/layers/euclidean_loss_layer.cpp @@ -8,9 +8,9 @@ namespace caffe { template -void EuclideanLossLayer::Reshape( - const vector*>& bottom, const vector*>& top) { - LossLayer::Reshape(bottom, top); +void EuclideanLossLayer::Reshape(const vector*>& bottom, + const vector*>& top) { + LossLayer < Dtype > ::Reshape(bottom, top); CHECK_EQ(bottom[0]->count(1), bottom[1]->count(1)) << "Inputs must have the same dimension."; diff_.ReshapeLike(*bottom[0]); @@ -20,10 +20,7 @@ template void EuclideanLossLayer::Forward_cpu(const vector*>& bottom, const vector*>& top) { int count = bottom[0]->count(); - caffe_sub( - count, - bottom[0]->cpu_data(), - bottom[1]->cpu_data(), + caffe_sub(count, bottom[0]->cpu_data(), bottom[1]->cpu_data(), diff_.mutable_cpu_data()); Dtype dot = caffe_cpu_dot(count, diff_.cpu_data(), diff_.cpu_data()); Dtype loss = dot / bottom[0]->num() / Dtype(2); @@ -37,8 +34,7 @@ void EuclideanLossLayer::Backward_cpu(const 
vector*>& top, if (propagate_down[i]) { const Dtype sign = (i == 0) ? 1 : -1; const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num(); - caffe_cpu_axpby( - bottom[i]->count(), // count + caffe_cpu_axpby(bottom[i]->count(), // count alpha, // alpha diff_.cpu_data(), // a Dtype(0), // beta @@ -47,11 +43,42 @@ void EuclideanLossLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY +#ifndef CPU_ONLY +// begin: code modified for OpenCL port +template +void EuclideanLossLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + int count = bottom[0]->count(); + caffe_gpu_sub(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), + diff_.mutable_gpu_data()); + Dtype dot; + caffe_gpu_dot(count, diff_.gpu_data(), diff_.gpu_data(), &dot); + Dtype loss = dot / bottom[0]->num() / Dtype(2); + top[0]->mutable_cpu_data()[0] = loss; +} + +template +void EuclideanLossLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + for (int i = 0; i < 2; ++i) { + if (propagate_down[i]) { + const Dtype sign = (i == 0) ? 1 : -1; + const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num(); + caffe_gpu_axpby(bottom[i]->count(), // count + alpha, // alpha + diff_.gpu_data(), // a + Dtype(0), // beta + bottom[i]->mutable_gpu_diff()); // b + } + } +} +// end: code modified for OpenCL port + +#else STUB_GPU(EuclideanLossLayer); #endif -INSTANTIATE_CLASS(EuclideanLossLayer); -REGISTER_LAYER_CLASS(EuclideanLoss); +INSTANTIATE_CLASS (EuclideanLossLayer); +REGISTER_LAYER_CLASS (EuclideanLoss); } // namespace caffe diff --git a/src/caffe/layers/euclidean_loss_layer.cu b/src/caffe/layers/euclidean_loss_layer.cu deleted file mode 100644 index 5b1de3ad..00000000 --- a/src/caffe/layers/euclidean_loss_layer.cu +++ /dev/null @@ -1,44 +0,0 @@ -#include - -#include "caffe/layer.hpp" -#include "caffe/util/io.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void EuclideanLossLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - int count = bottom[0]->count(); - caffe_gpu_sub( - count, - bottom[0]->gpu_data(), - bottom[1]->gpu_data(), - diff_.mutable_gpu_data()); - Dtype dot; - caffe_gpu_dot(count, diff_.gpu_data(), diff_.gpu_data(), &dot); - Dtype loss = dot / bottom[0]->num() / Dtype(2); - top[0]->mutable_cpu_data()[0] = loss; -} - -template -void EuclideanLossLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - for (int i = 0; i < 2; ++i) { - if (propagate_down[i]) { - const Dtype sign = (i == 0) ? 
1 : -1; - const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num(); - caffe_gpu_axpby( - bottom[i]->count(), // count - alpha, // alpha - diff_.gpu_data(), // a - Dtype(0), // beta - bottom[i]->mutable_gpu_diff()); // b - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(EuclideanLossLayer); - -} // namespace caffe diff --git a/src/caffe/layers/exp_layer.cpp b/src/caffe/layers/exp_layer.cpp index c7e7c60c..ad40bb1b 100644 --- a/src/caffe/layers/exp_layer.cpp +++ b/src/caffe/layers/exp_layer.cpp @@ -9,8 +9,8 @@ namespace caffe { template void ExpLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - NeuronLayer::LayerSetUp(bottom, top); + const vector*>& top) { + NeuronLayer < Dtype > ::LayerSetUp(bottom, top); const Dtype base = this->layer_param_.exp_param().base(); if (base != Dtype(-1)) { CHECK_GT(base, 0) << "base must be strictly positive."; @@ -18,10 +18,10 @@ void ExpLayer::LayerSetUp(const vector*>& bottom, // If base == -1, interpret the base as e and set log_base = 1 exactly. // Otherwise, calculate its log explicitly. const Dtype log_base = (base == Dtype(-1)) ? Dtype(1) : log(base); - CHECK(!isnan(log_base)) - << "NaN result: log(base) = log(" << base << ") = " << log_base; - CHECK(!isinf(log_base)) - << "Inf result: log(base) = log(" << base << ") = " << log_base; + CHECK(!isnan(log_base)) << "NaN result: log(base) = log(" << base << ") = " + << log_base; + CHECK(!isinf(log_base)) << "Inf result: log(base) = log(" << base << ") = " + << log_base; const Dtype input_scale = this->layer_param_.exp_param().scale(); const Dtype input_shift = this->layer_param_.exp_param().shift(); inner_scale_ = log_base * input_scale; @@ -48,7 +48,9 @@ void ExpLayer::Forward_cpu(const vector*>& bottom, template void ExpLayer::Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { return; } + if (!propagate_down[0]) { + return; + } const int count = bottom[0]->count(); const Dtype* top_data = top[0]->cpu_data(); const Dtype* top_diff = top[0]->cpu_diff(); @@ -59,11 +61,47 @@ void ExpLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY +// begin: code modified for OpenCL port +#ifndef CPU_ONLY +template +void ExpLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + const int count = bottom[0]->count(); + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + if (inner_scale_ == Dtype(1)) { + caffe_gpu_exp(count, bottom_data, top_data); + } else { + caffe_gpu_scale(count, inner_scale_, bottom_data, top_data); + caffe_gpu_exp(count, top_data, top_data); + } + if (outer_scale_ != Dtype(1)) { + caffe_gpu_scal(count, outer_scale_, top_data); + } +} + +template +void ExpLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + if (!propagate_down[0]) { + return; + } + const int count = bottom[0]->count(); + const Dtype* top_data = top[0]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + caffe_gpu_mul(count, top_data, top_diff, bottom_diff); + if (inner_scale_ != Dtype(1)) { + caffe_gpu_scal(count, inner_scale_, bottom_diff); + } +} +// end: code modified for OpenCL port + +#else STUB_GPU(ExpLayer); #endif -INSTANTIATE_CLASS(ExpLayer); -REGISTER_LAYER_CLASS(Exp); +INSTANTIATE_CLASS (ExpLayer); +REGISTER_LAYER_CLASS (Exp); } // namespace caffe diff --git a/src/caffe/layers/exp_layer.cu b/src/caffe/layers/exp_layer.cu deleted file mode 100644 index 
2d75d8dd..00000000 --- a/src/caffe/layers/exp_layer.cu +++ /dev/null @@ -1,44 +0,0 @@ -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void ExpLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const int count = bottom[0]->count(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - if (inner_scale_ == Dtype(1)) { - caffe_gpu_exp(count, bottom_data, top_data); - } else { - caffe_gpu_scale(count, inner_scale_, bottom_data, top_data); - caffe_gpu_exp(count, top_data, top_data); - } - if (outer_scale_ != Dtype(1)) { - caffe_gpu_scal(count, outer_scale_, top_data); - } -} - -template -void ExpLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { return; } - const int count = bottom[0]->count(); - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - caffe_gpu_mul(count, top_data, top_diff, bottom_diff); - if (inner_scale_ != Dtype(1)) { - caffe_gpu_scal(count, inner_scale_, bottom_diff); - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(ExpLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/filter_layer.cpp b/src/caffe/layers/filter_layer.cpp index be1db32d..884764b4 100644 --- a/src/caffe/layers/filter_layer.cpp +++ b/src/caffe/layers/filter_layer.cpp @@ -9,14 +9,14 @@ namespace caffe { template void FilterLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { CHECK_EQ(top.size(), bottom.size() - 1); first_reshape_ = true; } template void FilterLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { // bottom[0...k-1] are the blobs to filter // bottom[last] is the "selector_blob" int selector_index = bottom.size() - 1; @@ -25,8 +25,8 @@ void FilterLayer::Reshape(const vector*>& bottom, << "Selector blob dimensions must be singletons (1), except the first"; } for (int i = 0; i < bottom.size() - 1; ++i) { - CHECK_EQ(bottom[selector_index]->shape(0), bottom[i]->shape(0)) << - "Each bottom should have the same 0th dimension as the selector blob"; + CHECK_EQ(bottom[selector_index]->shape(0), bottom[i]->shape(0)) + << "Each bottom should have the same 0th dimension as the selector blob"; } const Dtype* bottom_data_selector = bottom[selector_index]->cpu_data(); @@ -61,7 +61,7 @@ void FilterLayer::Reshape(const vector*>& bottom, template void FilterLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { int new_tops_num = indices_to_forward_.size(); // forward all filtered items for all bottoms but the Selector (bottom[last]) for (int t = 0; t < top.size(); ++t) { @@ -79,10 +79,10 @@ void FilterLayer::Forward_cpu(const vector*>& bottom, template void FilterLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { if (propagate_down[bottom.size() - 1]) { LOG(FATAL) << this->type() - << "Layer cannot backpropagate to filter index inputs"; + << "Layer cannot backpropagate to filter index inputs"; } for (int i = 0; i < top.size(); i++) { // bottom[last] is the selector and never needs backpropagation @@ -117,11 +117,73 @@ void FilterLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY +// begin: code modified for OpenCL port +#ifndef CPU_ONLY +template 
+void FilterLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + int new_tops_num = indices_to_forward_.size(); + // forward all filtered items for all bottoms but the Selector (bottom[last]) + for (int t = 0; t < top.size(); ++t) { + const Dtype* bottom_data = bottom[t]->gpu_data(); + Dtype* top_data = top[t]->mutable_gpu_data(); + int dim = bottom[t]->count() / bottom[t]->shape(0); + for (int n = 0; n < new_tops_num; ++n) { + int data_offset_top = n * dim; + int data_offset_bottom = indices_to_forward_[n] * dim; + caffe_copy(dim, bottom_data + data_offset_bottom, + top_data + data_offset_top); + } + } +} + +template +void FilterLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[bottom.size() - 1]) { + LOG(FATAL) << this->type() + << "Layer cannot backpropagate to filter index inputs"; + } + for (int i = 0; i < top.size(); ++i) { + // bottom[last] is the selector and never needs backpropagation + // so we can iterate over top vector because top.size() == bottom.size() -1 + if (propagate_down[i]) { + const int dim = top[i]->count() / top[i]->shape(0); + int next_to_backward_offset = 0; + int batch_offset = 0; + int data_offset_bottom = 0; + int data_offset_top = 0; + for (int n = 0; n < bottom[i]->shape(0); ++n) { + if (next_to_backward_offset >= indices_to_forward_.size()) { + // we already visited all items that were been forwarded, so + // just set to zero remaining ones + data_offset_bottom = n * dim; + caffe_gpu_set(dim, Dtype(0), + bottom[i]->mutable_gpu_diff() + data_offset_bottom); + } else { + batch_offset = indices_to_forward_[next_to_backward_offset]; + data_offset_bottom = n * dim; + if (n != batch_offset) { // this data was not been forwarded + caffe_gpu_set(dim, Dtype(0), + bottom[i]->mutable_gpu_diff() + data_offset_bottom); + } else { // this data was been forwarded + data_offset_top = next_to_backward_offset * dim; + ++next_to_backward_offset; // point to next forwarded item index + caffe_copy(dim, top[i]->mutable_gpu_diff() + data_offset_top, + bottom[i]->mutable_gpu_diff() + data_offset_bottom); + } + } + } + } + } +} + +// end: code modified for OpenCL port +#else STUB_GPU(FilterLayer); #endif -INSTANTIATE_CLASS(FilterLayer); -REGISTER_LAYER_CLASS(Filter); +INSTANTIATE_CLASS (FilterLayer); +REGISTER_LAYER_CLASS (Filter); } // namespace caffe diff --git a/src/caffe/layers/filter_layer.cu b/src/caffe/layers/filter_layer.cu deleted file mode 100644 index cf929eee..00000000 --- a/src/caffe/layers/filter_layer.cu +++ /dev/null @@ -1,70 +0,0 @@ -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void FilterLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - int new_tops_num = indices_to_forward_.size(); - // forward all filtered items for all bottoms but the Selector (bottom[last]) - for (int t = 0; t < top.size(); ++t) { - const Dtype* bottom_data = bottom[t]->gpu_data(); - Dtype* top_data = top[t]->mutable_gpu_data(); - int dim = bottom[t]->count() / bottom[t]->shape(0); - for (int n = 0; n < new_tops_num; ++n) { - int data_offset_top = n * dim; - int data_offset_bottom = indices_to_forward_[n] * dim; - caffe_copy(dim, bottom_data + data_offset_bottom, - top_data + data_offset_top); - } - } -} - -template -void FilterLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (propagate_down[bottom.size() - 1]) { - LOG(FATAL) << 
this->type() - << "Layer cannot backpropagate to filter index inputs"; - } - for (int i = 0; i < top.size(); ++i) { - // bottom[last] is the selector and never needs backpropagation - // so we can iterate over top vector because top.size() == bottom.size() -1 - if (propagate_down[i]) { - const int dim = top[i]->count() / top[i]->shape(0); - int next_to_backward_offset = 0; - int batch_offset = 0; - int data_offset_bottom = 0; - int data_offset_top = 0; - for (int n = 0; n < bottom[i]->shape(0); ++n) { - if (next_to_backward_offset >= indices_to_forward_.size()) { - // we already visited all items that were been forwarded, so - // just set to zero remaining ones - data_offset_bottom = n * dim; - caffe_gpu_set(dim, Dtype(0), - bottom[i]->mutable_gpu_diff() + data_offset_bottom); - } else { - batch_offset = indices_to_forward_[next_to_backward_offset]; - data_offset_bottom = n * dim; - if (n != batch_offset) { // this data was not been forwarded - caffe_gpu_set(dim, Dtype(0), - bottom[i]->mutable_gpu_diff() + data_offset_bottom); - } else { // this data was been forwarded - data_offset_top = next_to_backward_offset * dim; - ++next_to_backward_offset; // point to next forwarded item index - caffe_copy(dim, top[i]->mutable_gpu_diff() + data_offset_top, - bottom[i]->mutable_gpu_diff() + data_offset_bottom); - } - } - } - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(FilterLayer); - -} // namespace caffe diff --git a/src/caffe/layers/flatten_layer.cpp b/src/caffe/layers/flatten_layer.cpp index f7e5c9c2..997f213d 100644 --- a/src/caffe/layers/flatten_layer.cpp +++ b/src/caffe/layers/flatten_layer.cpp @@ -8,7 +8,7 @@ namespace caffe { template void FlattenLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const int start_axis = bottom[0]->CanonicalAxisIndex( this->layer_param_.flatten_param().axis()); const int end_axis = bottom[0]->CanonicalAxisIndex( @@ -28,17 +28,17 @@ void FlattenLayer::Reshape(const vector*>& bottom, template void FlattenLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { top[0]->ShareData(*bottom[0]); } template void FlattenLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { bottom[0]->ShareDiff(*top[0]); } -INSTANTIATE_CLASS(FlattenLayer); -REGISTER_LAYER_CLASS(Flatten); +INSTANTIATE_CLASS (FlattenLayer); +REGISTER_LAYER_CLASS (Flatten); } // namespace caffe diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp index 8a782f7e..c87304b0 100644 --- a/src/caffe/layers/hdf5_data_layer.cpp +++ b/src/caffe/layers/hdf5_data_layer.cpp @@ -1,11 +1,11 @@ /* -TODO: -- load file in a separate thread ("prefetch") -- can be smarter about the memcpy call instead of doing it row-by-row - :: use util functions caffe_copy, and Blob->offset() - :: don't forget to update hdf5_daa_layer.cu accordingly -- add ability to shuffle filenames if flag is set -*/ + TODO: + - load file in a separate thread ("prefetch") + - can be smarter about the memcpy call instead of doing it row-by-row + :: use util functions caffe_copy, and Blob->offset() + :: don't forget to update hdf5_daa_layer.cu accordingly + - add ability to shuffle filenames if flag is set + */ #include // NOLINT(readability/streams) #include #include @@ -21,7 +21,8 @@ namespace caffe { template -HDF5DataLayer::~HDF5DataLayer() { } +HDF5DataLayer::~HDF5DataLayer() { +} // Load data and label from HDF5 filename into the class property 
blobs. template @@ -39,7 +40,7 @@ void HDF5DataLayer::LoadHDF5FileData(const char* filename) { const int MAX_DATA_DIM = INT_MAX; for (int i = 0; i < top_size; ++i) { - hdf_blobs_[i] = shared_ptr >(new Blob()); + hdf_blobs_[i] = shared_ptr < Blob > (new Blob()); hdf5_load_nd_dataset(file_id, this->layer_param_.top(i).c_str(), MIN_DATA_DIM, MAX_DATA_DIM, hdf_blobs_[i].get()); } @@ -63,7 +64,7 @@ void HDF5DataLayer::LoadHDF5FileData(const char* filename) { if (this->layer_param_.hdf5_data_param().shuffle()) { std::random_shuffle(data_permutation_.begin(), data_permutation_.end()); DLOG(INFO) << "Successully loaded " << hdf_blobs_[0]->shape(0) - << " rows (shuffled)"; + << " rows (shuffled)"; } else { DLOG(INFO) << "Successully loaded " << hdf_blobs_[0]->shape(0) << " rows"; } @@ -71,10 +72,10 @@ void HDF5DataLayer::LoadHDF5FileData(const char* filename) { template void HDF5DataLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { // Refuse transformation parameters since HDF5 is totally generic. - CHECK(!this->layer_param_.has_transform_param()) << - this->type() << " does not transform data."; + CHECK(!this->layer_param_.has_transform_param()) << this->type() + << " does not transform data."; // Read the source to parse the filenames. const string& source = this->layer_param_.hdf5_data_param().source(); LOG(INFO) << "Loading list of HDF5 filenames from: " << source; @@ -93,7 +94,7 @@ void HDF5DataLayer::LayerSetUp(const vector*>& bottom, current_file_ = 0; LOG(INFO) << "Number of HDF5 files: " << num_files_; CHECK_GE(num_files_, 1) << "Must have at least 1 HDF5 filename listed in " - << source; + << source; file_permutation_.clear(); file_permutation_.resize(num_files_); @@ -127,7 +128,7 @@ void HDF5DataLayer::LayerSetUp(const vector*>& bottom, template void HDF5DataLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const int batch_size = this->layer_param_.hdf5_data_param().batch_size(); for (int i = 0; i < batch_size; ++i, ++current_row_) { if (current_row_ == hdf_blobs_[0]->shape(0)) { @@ -137,7 +138,7 @@ void HDF5DataLayer::Forward_cpu(const vector*>& bottom, current_file_ = 0; if (this->layer_param_.hdf5_data_param().shuffle()) { std::random_shuffle(file_permutation_.begin(), - file_permutation_.end()); + file_permutation_.end()); } DLOG(INFO) << "Looping around to first file."; } @@ -151,17 +152,58 @@ void HDF5DataLayer::Forward_cpu(const vector*>& bottom, for (int j = 0; j < this->layer_param_.top_size(); ++j) { int data_dim = top[j]->count() / top[j]->shape(0); caffe_copy(data_dim, - &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] - * data_dim], &top[j]->mutable_cpu_data()[i * data_dim]); + &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] * data_dim], + &top[j]->mutable_cpu_data()[i * data_dim]); + } + } +} + +// begin: code modified for OpenCL port +#ifndef CPU_ONLY +template +void HDF5DataLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + const int batch_size = this->layer_param_.hdf5_data_param().batch_size(); + for (int i = 0; i < batch_size; ++i, ++current_row_) { + if (current_row_ == hdf_blobs_[0]->shape(0)) { + if (num_files_ > 1) { + current_file_ += 1; + if (current_file_ == num_files_) { + current_file_ = 0; + if (this->layer_param_.hdf5_data_param().shuffle()) { + std::random_shuffle(file_permutation_.begin(), + file_permutation_.end()); + } + DLOG(INFO) << "Looping around to first file."; + } + LoadHDF5FileData( + 
hdf_filenames_[file_permutation_[current_file_]].c_str()); + } + current_row_ = 0; + if (this->layer_param_.hdf5_data_param().shuffle()) + std::random_shuffle(data_permutation_.begin(), data_permutation_.end()); + } + for (int j = 0; j < this->layer_param_.top_size(); ++j) { + int data_dim = top[j]->count() / top[j]->shape(0); + OCL_CHECK( + clEnqueueWriteBuffer(amdDevice.CommandQueue, + (cl_mem) top[j]->mutable_gpu_data(), CL_TRUE, + i * data_dim * sizeof(Dtype), sizeof(Dtype) * data_dim, + &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] + * data_dim], 0, NULL, NULL)); + //caffe_copy(data_dim, + // &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] + // * data_dim], &top[j]->mutable_cpu_data()[i * data_dim]); } } } +// end: code modified for OpenCL port -#ifdef CPU_ONLY +#else STUB_GPU_FORWARD(HDF5DataLayer, Forward); #endif -INSTANTIATE_CLASS(HDF5DataLayer); -REGISTER_LAYER_CLASS(HDF5Data); +INSTANTIATE_CLASS (HDF5DataLayer); +REGISTER_LAYER_CLASS (HDF5Data); } // namespace caffe diff --git a/src/caffe/layers/hdf5_data_layer.cu b/src/caffe/layers/hdf5_data_layer.cu deleted file mode 100644 index 5e3e4ced..00000000 --- a/src/caffe/layers/hdf5_data_layer.cu +++ /dev/null @@ -1,53 +0,0 @@ -/* -TODO: -- only load parts of the file, in accordance with a prototxt param "max_mem" -*/ - -#include -#include -#include - -#include "hdf5.h" -#include "hdf5_hl.h" - -#include "caffe/data_layers.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/io.hpp" - -namespace caffe { - -template -void HDF5DataLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const int batch_size = this->layer_param_.hdf5_data_param().batch_size(); - for (int i = 0; i < batch_size; ++i, ++current_row_) { - if (current_row_ == hdf_blobs_[0]->shape(0)) { - if (num_files_ > 1) { - current_file_ += 1; - if (current_file_ == num_files_) { - current_file_ = 0; - if (this->layer_param_.hdf5_data_param().shuffle()) { - std::random_shuffle(file_permutation_.begin(), - file_permutation_.end()); - } - DLOG(INFO) << "Looping around to first file."; - } - LoadHDF5FileData( - hdf_filenames_[file_permutation_[current_file_]].c_str()); - } - current_row_ = 0; - if (this->layer_param_.hdf5_data_param().shuffle()) - std::random_shuffle(data_permutation_.begin(), data_permutation_.end()); - } - for (int j = 0; j < this->layer_param_.top_size(); ++j) { - int data_dim = top[j]->count() / top[j]->shape(0); - caffe_copy(data_dim, - &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] - * data_dim], &top[j]->mutable_gpu_data()[i * data_dim]); - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(HDF5DataLayer); - -} // namespace caffe diff --git a/src/caffe/layers/hdf5_output_layer.cpp b/src/caffe/layers/hdf5_output_layer.cpp index f63375c3..0005fb94 100644 --- a/src/caffe/layers/hdf5_output_layer.cpp +++ b/src/caffe/layers/hdf5_output_layer.cpp @@ -16,7 +16,7 @@ void HDF5OutputLayer::LayerSetUp(const vector*>& bottom, const vector*>& top) { file_name_ = this->layer_param_.hdf5_output_param().file_name(); file_id_ = H5Fcreate(file_name_.c_str(), H5F_ACC_TRUNC, H5P_DEFAULT, - H5P_DEFAULT); + H5P_DEFAULT); CHECK_GE(file_id_, 0) << "Failed to open HDF5 file" << file_name_; file_opened_ = true; } @@ -33,8 +33,8 @@ template void HDF5OutputLayer::SaveBlobs() { // TODO: no limit on the number of blobs LOG(INFO) << "Saving HDF5 file " << file_name_; - CHECK_EQ(data_blob_.num(), label_blob_.num()) << - "data blob and label blob must have the same batch size"; + CHECK_EQ(data_blob_.num(), label_blob_.num()) + << "data blob and 
label blob must have the same batch size"; hdf5_save_nd_dataset(file_id_, HDF5_DATA_DATASET_NAME, data_blob_); hdf5_save_nd_dataset(file_id_, HDF5_DATA_LABEL_NAME, label_blob_); LOG(INFO) << "Successfully saved " << data_blob_.num() << " rows"; @@ -42,13 +42,13 @@ void HDF5OutputLayer::SaveBlobs() { template void HDF5OutputLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { CHECK_GE(bottom.size(), 2); CHECK_EQ(bottom[0]->num(), bottom[1]->num()); data_blob_.Reshape(bottom[0]->num(), bottom[0]->channels(), - bottom[0]->height(), bottom[0]->width()); + bottom[0]->height(), bottom[0]->width()); label_blob_.Reshape(bottom[1]->num(), bottom[1]->channels(), - bottom[1]->height(), bottom[1]->width()); + bottom[1]->height(), bottom[1]->width()); const int data_datum_dim = bottom[0]->count() / bottom[0]->num(); const int label_datum_dim = bottom[1]->count() / bottom[1]->num(); @@ -63,15 +63,53 @@ void HDF5OutputLayer::Forward_cpu(const vector*>& bottom, template void HDF5OutputLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { + return; +} + +#ifndef CPU_ONLY +// begin: code modified for OpenCL port +template +void HDF5OutputLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + CHECK_GE(bottom.size(), 2); + CHECK_EQ(bottom[0]->num(), bottom[1]->num()); + data_blob_.Reshape(bottom[0]->num(), bottom[0]->channels(), + bottom[0]->height(), bottom[0]->width()); + label_blob_.Reshape(bottom[1]->num(), bottom[1]->channels(), + bottom[1]->height(), bottom[1]->width()); + const int data_datum_dim = bottom[0]->count() / bottom[0]->num(); + const int label_datum_dim = bottom[1]->count() / bottom[1]->num(); + + for (int i = 0; i < bottom[0]->num(); ++i) { + OCL_CHECK( + clEnqueueReadBuffer(amdDevice.CommandQueue, + (cl_mem) bottom[0]->gpu_data(), CL_TRUE, + i * data_datum_dim * sizeof(Dtype), sizeof(Dtype) * data_datum_dim, + &data_blob_.mutable_cpu_data()[i * data_datum_dim], 0, NULL, NULL)); + OCL_CHECK( + clEnqueueReadBuffer(amdDevice.CommandQueue, + (cl_mem) bottom[1]->gpu_data(), CL_TRUE, + i * label_datum_dim * sizeof(Dtype), + sizeof(Dtype) * label_datum_dim, + &label_blob_.mutable_cpu_data()[i * label_datum_dim], 0, NULL, + NULL)); + } + SaveBlobs(); +} + +template +void HDF5OutputLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { return; } +// end: code modified for OpenCL port -#ifdef CPU_ONLY +#else STUB_GPU(HDF5OutputLayer); #endif -INSTANTIATE_CLASS(HDF5OutputLayer); -REGISTER_LAYER_CLASS(HDF5Output); +INSTANTIATE_CLASS (HDF5OutputLayer); +REGISTER_LAYER_CLASS (HDF5Output); } // namespace caffe diff --git a/src/caffe/layers/hdf5_output_layer.cu b/src/caffe/layers/hdf5_output_layer.cu deleted file mode 100644 index ae497c34..00000000 --- a/src/caffe/layers/hdf5_output_layer.cu +++ /dev/null @@ -1,43 +0,0 @@ -#include - -#include "hdf5.h" -#include "hdf5_hl.h" - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/io.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void HDF5OutputLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - CHECK_GE(bottom.size(), 2); - CHECK_EQ(bottom[0]->num(), bottom[1]->num()); - data_blob_.Reshape(bottom[0]->num(), bottom[0]->channels(), - bottom[0]->height(), bottom[0]->width()); - label_blob_.Reshape(bottom[1]->num(), bottom[1]->channels(), - bottom[1]->height(), 
bottom[1]->width()); - const int data_datum_dim = bottom[0]->count() / bottom[0]->num(); - const int label_datum_dim = bottom[1]->count() / bottom[1]->num(); - - for (int i = 0; i < bottom[0]->num(); ++i) { - caffe_copy(data_datum_dim, &bottom[0]->gpu_data()[i * data_datum_dim], - &data_blob_.mutable_cpu_data()[i * data_datum_dim]); - caffe_copy(label_datum_dim, &bottom[1]->gpu_data()[i * label_datum_dim], - &label_blob_.mutable_cpu_data()[i * label_datum_dim]); - } - SaveBlobs(); -} - -template -void HDF5OutputLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - return; -} - -INSTANTIATE_LAYER_GPU_FUNCS(HDF5OutputLayer); - -} // namespace caffe diff --git a/src/caffe/layers/hinge_loss_layer.cpp b/src/caffe/layers/hinge_loss_layer.cpp index a2fb2a18..b2259859 100644 --- a/src/caffe/layers/hinge_loss_layer.cpp +++ b/src/caffe/layers/hinge_loss_layer.cpp @@ -26,8 +26,8 @@ void HingeLossLayer::Forward_cpu(const vector*>& bottom, } for (int i = 0; i < num; ++i) { for (int j = 0; j < dim; ++j) { - bottom_diff[i * dim + j] = std::max( - Dtype(0), 1 + bottom_diff[i * dim + j]); + bottom_diff[i * dim + j] = std::max(Dtype(0), + 1 + bottom_diff[i * dim + j]); } } Dtype* loss = top[0]->mutable_cpu_data(); @@ -48,7 +48,7 @@ void HingeLossLayer::Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { if (propagate_down[1]) { LOG(FATAL) << this->type() - << " Layer cannot backpropagate to label inputs."; + << " Layer cannot backpropagate to label inputs."; } if (propagate_down[0]) { Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); @@ -76,7 +76,7 @@ void HingeLossLayer::Backward_cpu(const vector*>& top, } } -INSTANTIATE_CLASS(HingeLossLayer); -REGISTER_LAYER_CLASS(HingeLoss); +INSTANTIATE_CLASS (HingeLossLayer); +REGISTER_LAYER_CLASS (HingeLoss); } // namespace caffe diff --git a/src/caffe/layers/im2col_layer.cpp b/src/caffe/layers/im2col_layer.cpp index 1c802714..36245446 100644 --- a/src/caffe/layers/im2col_layer.cpp +++ b/src/caffe/layers/im2col_layer.cpp @@ -9,21 +9,24 @@ namespace caffe { template void Im2colLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { ConvolutionParameter conv_param = this->layer_param_.convolution_param(); - CHECK(!conv_param.has_kernel_size() != - !(conv_param.has_kernel_h() && conv_param.has_kernel_w())) + CHECK( + !conv_param.has_kernel_size() + != !(conv_param.has_kernel_h() && conv_param.has_kernel_w())) << "Filter size is kernel_size OR kernel_h and kernel_w; not both"; - CHECK(conv_param.has_kernel_size() || - (conv_param.has_kernel_h() && conv_param.has_kernel_w())) + CHECK( + conv_param.has_kernel_size() + || (conv_param.has_kernel_h() && conv_param.has_kernel_w())) << "For non-square filters both kernel_h and kernel_w are required."; - CHECK((!conv_param.has_pad() && conv_param.has_pad_h() - && conv_param.has_pad_w()) - || (!conv_param.has_pad_h() && !conv_param.has_pad_w())) + CHECK( + (!conv_param.has_pad() && conv_param.has_pad_h() && conv_param.has_pad_w()) + || (!conv_param.has_pad_h() && !conv_param.has_pad_w())) << "pad is pad OR pad_h and pad_w are required."; - CHECK((!conv_param.has_stride() && conv_param.has_stride_h() - && conv_param.has_stride_w()) - || (!conv_param.has_stride_h() && !conv_param.has_stride_w())) + CHECK( + (!conv_param.has_stride() && conv_param.has_stride_h() + && conv_param.has_stride_w()) + || (!conv_param.has_stride_h() && !conv_param.has_stride_w())) << "Stride is stride OR stride_h and stride_w are 
required."; if (conv_param.has_kernel_size()) { kernel_h_ = kernel_w_ = conv_param.kernel_size(); @@ -49,47 +52,74 @@ void Im2colLayer::LayerSetUp(const vector*>& bottom, template void Im2colLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " << "corresponding to (num, channels, height, width)"; channels_ = bottom[0]->channels(); height_ = bottom[0]->height(); width_ = bottom[0]->width(); - top[0]->Reshape( - bottom[0]->num(), channels_ * kernel_h_ * kernel_w_, + top[0]->Reshape(bottom[0]->num(), channels_ * kernel_h_ * kernel_w_, (height_ + 2 * pad_h_ - kernel_h_) / stride_h_ + 1, (width_ + 2 * pad_w_ - kernel_w_) / stride_w_ + 1); } template void Im2colLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* top_data = top[0]->mutable_cpu_data(); for (int n = 0; n < bottom[0]->num(); ++n) { - im2col_cpu(bottom_data + bottom[0]->offset(n), channels_, height_, - width_, kernel_h_, kernel_w_, pad_h_, pad_w_, - stride_h_, stride_w_, top_data + top[0]->offset(n)); + im2col_cpu(bottom_data + bottom[0]->offset(n), channels_, height_, width_, + kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, + top_data + top[0]->offset(n)); } } template void Im2colLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { const Dtype* top_diff = top[0]->cpu_diff(); Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); for (int n = 0; n < top[0]->num(); ++n) { col2im_cpu(top_diff + top[0]->offset(n), channels_, height_, width_, - kernel_h_, kernel_w_, pad_h_, pad_w_, - stride_h_, stride_w_, bottom_diff + bottom[0]->offset(n)); + kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, + bottom_diff + bottom[0]->offset(n)); } } -#ifdef CPU_ONLY +#ifndef CPU_ONLY +// begin: code modified for OpenCL port +template +void Im2colLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + for (int n = 0; n < bottom[0]->num(); ++n) { + im2col_gpu(bottom_data, bottom[0]->offset(n), channels_, height_, width_, + kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, top_data, + top[0]->offset(n)); + } +} + +template +void Im2colLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + for (int n = 0; n < top[0]->num(); ++n) { + col2im_gpu(top_diff, top[0]->offset(n), channels_, height_, width_, + kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, bottom_diff, + bottom[0]->offset(n)); + } +} +// end: code modified for OpenCL port + +#else STUB_GPU(Im2colLayer); #endif -INSTANTIATE_CLASS(Im2colLayer); -REGISTER_LAYER_CLASS(Im2col); +INSTANTIATE_CLASS (Im2colLayer); +REGISTER_LAYER_CLASS (Im2col); } // namespace caffe diff --git a/src/caffe/layers/im2col_layer.cu b/src/caffe/layers/im2col_layer.cu deleted file mode 100644 index 9c338b14..00000000 --- a/src/caffe/layers/im2col_layer.cu +++ /dev/null @@ -1,37 +0,0 @@ -#include - -#include "caffe/common.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/im2col.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void Im2colLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { 
- const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - for (int n = 0; n < bottom[0]->num(); ++n) { - im2col_gpu(bottom_data + bottom[0]->offset(n), channels_, height_, - width_, kernel_h_, kernel_w_, pad_h_, pad_w_, - stride_h_, stride_w_, top_data + top[0]->offset(n)); - } -} - -template -void Im2colLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - for (int n = 0; n < top[0]->num(); ++n) { - col2im_gpu(top_diff + top[0]->offset(n), channels_, height_, width_, - kernel_h_, kernel_w_, pad_h_, pad_w_, - stride_h_, stride_w_, bottom_diff + bottom[0]->offset(n)); - } -} - - -INSTANTIATE_LAYER_GPU_FUNCS(Im2colLayer); - -} // namespace caffe diff --git a/src/caffe/layers/image_data_layer.cpp b/src/caffe/layers/image_data_layer.cpp index 18c035cb..21957551 100644 --- a/src/caffe/layers/image_data_layer.cpp +++ b/src/caffe/layers/image_data_layer.cpp @@ -22,15 +22,16 @@ ImageDataLayer::~ImageDataLayer() { template void ImageDataLayer::DataLayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const int new_height = this->layer_param_.image_data_param().new_height(); - const int new_width = this->layer_param_.image_data_param().new_width(); - const bool is_color = this->layer_param_.image_data_param().is_color(); + const int new_width = this->layer_param_.image_data_param().new_width(); + const bool is_color = this->layer_param_.image_data_param().is_color(); string root_folder = this->layer_param_.image_data_param().root_folder(); - CHECK((new_height == 0 && new_width == 0) || - (new_height > 0 && new_width > 0)) << "Current implementation requires " - "new_height and new_width to be set at the same time."; + CHECK( + (new_height == 0 && new_width == 0) || (new_height > 0 && new_width > 0)) + << "Current implementation requires " + "new_height and new_width to be set at the same time."; // Read the file with filenames and labels const string& source = this->layer_param_.image_data_param().source(); LOG(INFO) << "Opening file " << source; @@ -53,15 +54,15 @@ void ImageDataLayer::DataLayerSetUp(const vector*>& bottom, lines_id_ = 0; // Check if we would need to randomly skip a few data points if (this->layer_param_.image_data_param().rand_skip()) { - unsigned int skip = caffe_rng_rand() % - this->layer_param_.image_data_param().rand_skip(); + unsigned int skip = caffe_rng_rand() + % this->layer_param_.image_data_param().rand_skip(); LOG(INFO) << "Skipping first " << skip << " data points."; CHECK_GT(lines_.size(), skip) << "Not enough points to skip"; lines_id_ = skip; } // Read an image, and use it to initialize the top blob. cv::Mat cv_img = ReadImageToCVMat(root_folder + lines_[lines_id_].first, - new_height, new_width, is_color); + new_height, new_width, is_color); // Use data_transformer to infer the expected blob shape from a cv_image. 
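// The setup code above reads a single image through ReadImageToCVMat and resizes
// it only when both new_height and new_width are positive, which is exactly what
// the CHECK earlier in this hunk enforces. A rough plain-OpenCV equivalent,
// assuming nothing about the helper beyond what this hunk shows:

#include <opencv2/opencv.hpp>
#include <string>

static cv::Mat read_image_to_cvmat(const std::string& path, int new_height,
                                   int new_width, bool is_color) {
  // Decode as 3-channel BGR or single-channel grayscale depending on is_color.
  cv::Mat img = cv::imread(path, is_color ? cv::IMREAD_COLOR
                                          : cv::IMREAD_GRAYSCALE);
  if (!img.empty() && new_height > 0 && new_width > 0) {
    cv::resize(img, img, cv::Size(new_width, new_height));  // fixed input shape
  }
  return img;
}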
vector top_shape = this->data_transformer_->InferBlobShape(cv_img); this->transformed_data_.Reshape(top_shape); @@ -153,7 +154,7 @@ void ImageDataLayer::InternalThreadEntry() { DLOG(INFO) << "Transform time: " << trans_time / 1000 << " ms."; } -INSTANTIATE_CLASS(ImageDataLayer); -REGISTER_LAYER_CLASS(ImageData); +INSTANTIATE_CLASS (ImageDataLayer); +REGISTER_LAYER_CLASS (ImageData); } // namespace caffe diff --git a/src/caffe/layers/infogain_loss_layer.cpp b/src/caffe/layers/infogain_loss_layer.cpp index a1e0b40d..ffd2ab97 100644 --- a/src/caffe/layers/infogain_loss_layer.cpp +++ b/src/caffe/layers/infogain_loss_layer.cpp @@ -11,24 +11,24 @@ namespace caffe { template -void InfogainLossLayer::LayerSetUp( - const vector*>& bottom, const vector*>& top) { - LossLayer::LayerSetUp(bottom, top); +void InfogainLossLayer::LayerSetUp(const vector*>& bottom, + const vector*>& top) { + LossLayer < Dtype > ::LayerSetUp(bottom, top); if (bottom.size() < 3) { CHECK(this->layer_param_.infogain_loss_param().has_source()) << "Infogain matrix source must be specified."; BlobProto blob_proto; - ReadProtoFromBinaryFile( - this->layer_param_.infogain_loss_param().source(), &blob_proto); + ReadProtoFromBinaryFile(this->layer_param_.infogain_loss_param().source(), + &blob_proto); infogain_.FromProto(blob_proto); } } template -void InfogainLossLayer::Reshape( - const vector*>& bottom, const vector*>& top) { - LossLayer::Reshape(bottom, top); - Blob* infogain = NULL; +void InfogainLossLayer::Reshape(const vector*>& bottom, + const vector*>& top) { + LossLayer < Dtype > ::Reshape(bottom, top); + Blob < Dtype > *infogain = NULL; if (bottom.size() < 3) { infogain = &infogain_; } else { @@ -45,7 +45,6 @@ void InfogainLossLayer::Reshape( CHECK_EQ(infogain->width(), dim); } - template void InfogainLossLayer::Forward_cpu(const vector*>& bottom, const vector*>& top) { @@ -72,15 +71,14 @@ void InfogainLossLayer::Forward_cpu(const vector*>& bottom, template void InfogainLossLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { if (propagate_down[1]) { LOG(FATAL) << this->type() - << " Layer cannot backpropagate to label inputs."; + << " Layer cannot backpropagate to label inputs."; } if (propagate_down.size() > 2 && propagate_down[2]) { LOG(FATAL) << this->type() - << " Layer cannot backpropagate to infogain inputs."; + << " Layer cannot backpropagate to infogain inputs."; } if (propagate_down[0]) { const Dtype* bottom_data = bottom[0]->cpu_data(); @@ -94,7 +92,7 @@ void InfogainLossLayer::Backward_cpu(const vector*>& top, Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); int num = bottom[0]->num(); int dim = bottom[0]->count() / bottom[0]->num(); - const Dtype scale = - top[0]->cpu_diff()[0] / num; + const Dtype scale = -top[0]->cpu_diff()[0] / num; for (int i = 0; i < num; ++i) { const int label = static_cast(bottom_label[i]); for (int j = 0; j < dim; ++j) { @@ -105,6 +103,6 @@ void InfogainLossLayer::Backward_cpu(const vector*>& top, } } -INSTANTIATE_CLASS(InfogainLossLayer); -REGISTER_LAYER_CLASS(InfogainLoss); +INSTANTIATE_CLASS (InfogainLossLayer); +REGISTER_LAYER_CLASS (InfogainLoss); } // namespace caffe diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp index 83c3235e..cfa4246a 100644 --- a/src/caffe/layers/inner_product_layer.cpp +++ b/src/caffe/layers/inner_product_layer.cpp @@ -11,7 +11,7 @@ namespace caffe { template void InnerProductLayer::LayerSetUp(const 
vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const int num_output = this->layer_param_.inner_product_param().num_output(); bias_term_ = this->layer_param_.inner_product_param().bias_term(); N_ = num_output; @@ -36,15 +36,19 @@ void InnerProductLayer::LayerSetUp(const vector*>& bottom, weight_shape[1] = K_; this->blobs_[0].reset(new Blob(weight_shape)); // fill the weights - shared_ptr > weight_filler(GetFiller( - this->layer_param_.inner_product_param().weight_filler())); + shared_ptr < Filler + > weight_filler( + GetFiller < Dtype + > (this->layer_param_.inner_product_param().weight_filler())); weight_filler->Fill(this->blobs_[0].get()); // If necessary, intiialize and fill the bias term if (bias_term_) { vector bias_shape(1, N_); this->blobs_[1].reset(new Blob(bias_shape)); - shared_ptr > bias_filler(GetFiller( - this->layer_param_.inner_product_param().bias_filler())); + shared_ptr < Filler + > bias_filler( + GetFiller < Dtype + > (this->layer_param_.inner_product_param().bias_filler())); bias_filler->Fill(this->blobs_[1].get()); } } // parameter initialization @@ -53,7 +57,7 @@ void InnerProductLayer::LayerSetUp(const vector*>& bottom, template void InnerProductLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { // Figure out the dimensions const int axis = bottom[0]->CanonicalAxisIndex( this->layer_param_.inner_product_param().axis()); @@ -83,47 +87,84 @@ void InnerProductLayer::Forward_cpu(const vector*>& bottom, const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* top_data = top[0]->mutable_cpu_data(); const Dtype* weight = this->blobs_[0]->cpu_data(); - caffe_cpu_gemm(CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype)1., - bottom_data, weight, (Dtype)0., top_data); + caffe_cpu_gemm < Dtype + > (CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype) 1., bottom_data, weight, (Dtype) 0., top_data); if (bias_term_) { - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype)1., - bias_multiplier_.cpu_data(), - this->blobs_[1]->cpu_data(), (Dtype)1., top_data); + caffe_cpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype) 1., bias_multiplier_.cpu_data(), this->blobs_[1]->cpu_data(), (Dtype) 1., top_data); } } template void InnerProductLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { if (this->param_propagate_down_[0]) { const Dtype* top_diff = top[0]->cpu_diff(); const Dtype* bottom_data = bottom[0]->cpu_data(); // Gradient with respect to weight - caffe_cpu_gemm(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1., - top_diff, bottom_data, (Dtype)1., this->blobs_[0]->mutable_cpu_diff()); + caffe_cpu_gemm < Dtype + > (CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype) 1., top_diff, bottom_data, (Dtype) 1., this->blobs_[0]->mutable_cpu_diff()); } if (bias_term_ && this->param_propagate_down_[1]) { const Dtype* top_diff = top[0]->cpu_diff(); // Gradient with respect to bias - caffe_cpu_gemv(CblasTrans, M_, N_, (Dtype)1., top_diff, - bias_multiplier_.cpu_data(), (Dtype)1., - this->blobs_[1]->mutable_cpu_diff()); + caffe_cpu_gemv < Dtype + > (CblasTrans, M_, N_, (Dtype) 1., top_diff, bias_multiplier_.cpu_data(), (Dtype) 1., this->blobs_[1]->mutable_cpu_diff()); } if (propagate_down[0]) { const Dtype* top_diff = top[0]->cpu_diff(); // Gradient with respect to bottom data - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype)1., - top_diff, this->blobs_[0]->cpu_data(), (Dtype)0., - bottom[0]->mutable_cpu_diff()); + 
caffe_cpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype) 1., top_diff, this->blobs_[0]->cpu_data(), (Dtype) 0., bottom[0]->mutable_cpu_diff()); + } +} + +#ifndef CPU_ONLY +// begin: code modified for OpenCL port +template +void InnerProductLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + const Dtype* weight = this->blobs_[0]->gpu_data(); + caffe_gpu_gemm < Dtype + > (CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype) 1., bottom_data, 0, weight, 0, (Dtype) 0., top_data, 0); + if (bias_term_) { + caffe_gpu_gemm < Dtype + > (CblasNoTrans, CblasTrans, M_, N_, 1, (Dtype) 1., bias_multiplier_.gpu_data(), 0, this->blobs_[1]->gpu_data(), 0, (Dtype) 1., top_data, 0); + } +} + +template +void InnerProductLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + if (this->param_propagate_down_[0]) { + const Dtype* top_diff = top[0]->gpu_diff(); + const Dtype* bottom_data = bottom[0]->gpu_data(); + // Gradient with respect to weight + caffe_gpu_gemm < Dtype + > (CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype) 1., top_diff, 0, bottom_data, 0, (Dtype) 1., this->blobs_[0]->mutable_gpu_diff(), 0); + } + if (bias_term_ && this->param_propagate_down_[1]) { + const Dtype* top_diff = top[0]->gpu_diff(); + // Gradient with respect to bias + caffe_gpu_gemv < Dtype + > (CblasTrans, M_, N_, (Dtype) 1., (Dtype*) top_diff, (size_t) 0, N_, reinterpret_cast(bias_multiplier_.gpu_data()), (size_t) 0, (Dtype) 0., 1, this->blobs_[1]->mutable_gpu_diff(), (size_t) 0, 1); + } + if (propagate_down[0]) { + const Dtype* top_diff = top[0]->gpu_diff(); + // Gradient with respect to bottom data + caffe_gpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype) 1., top_diff, 0, this->blobs_[0]->gpu_data(), 0, (Dtype) 0., bottom[0]->mutable_gpu_diff(), 0); } } +// end: code modified for OpenCL port -#ifdef CPU_ONLY +#else STUB_GPU(InnerProductLayer); #endif -INSTANTIATE_CLASS(InnerProductLayer); -REGISTER_LAYER_CLASS(InnerProduct); +INSTANTIATE_CLASS (InnerProductLayer); +REGISTER_LAYER_CLASS (InnerProduct); } // namespace caffe diff --git a/src/caffe/layers/inner_product_layer.cu b/src/caffe/layers/inner_product_layer.cu deleted file mode 100644 index dd90cac1..00000000 --- a/src/caffe/layers/inner_product_layer.cu +++ /dev/null @@ -1,56 +0,0 @@ -#include - -#include "caffe/blob.hpp" -#include "caffe/common.hpp" -#include "caffe/filler.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void InnerProductLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const Dtype* weight = this->blobs_[0]->gpu_data(); - caffe_gpu_gemm(CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype)1., - bottom_data, weight, (Dtype)0., top_data); - if (bias_term_) { - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype)1., - bias_multiplier_.gpu_data(), - this->blobs_[1]->gpu_data(), (Dtype)1., top_data); - } -} - -template -void InnerProductLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (this->param_propagate_down_[0]) { - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - // Gradient with respect to weight - caffe_gpu_gemm(CblasTrans, CblasNoTrans, N_, K_, 
M_, (Dtype)1., - top_diff, bottom_data, (Dtype)1., this->blobs_[0]->mutable_gpu_diff()); - } - if (bias_term_ && this->param_propagate_down_[1]) { - const Dtype* top_diff = top[0]->gpu_diff(); - // Gradient with respect to bias - caffe_gpu_gemv(CblasTrans, M_, N_, (Dtype)1., top_diff, - bias_multiplier_.gpu_data(), (Dtype)1., - this->blobs_[1]->mutable_gpu_diff()); - } - if (propagate_down[0]) { - const Dtype* top_diff = top[0]->gpu_diff(); - // Gradient with respect to bottom data - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype)1., - top_diff, this->blobs_[0]->gpu_data(), (Dtype)0., - bottom[0]->mutable_gpu_diff()); - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(InnerProductLayer); - -} // namespace caffe diff --git a/src/caffe/layers/log_layer.cpp b/src/caffe/layers/log_layer.cpp index 55a227f6..a01c9c18 100644 --- a/src/caffe/layers/log_layer.cpp +++ b/src/caffe/layers/log_layer.cpp @@ -9,8 +9,8 @@ namespace caffe { template void LogLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - NeuronLayer::LayerSetUp(bottom, top); + const vector*>& top) { + NeuronLayer < Dtype > ::LayerSetUp(bottom, top); const Dtype base = this->layer_param_.log_param().base(); if (base != Dtype(-1)) { CHECK_GT(base, 0) << "base must be strictly positive."; @@ -18,15 +18,15 @@ void LogLayer::LayerSetUp(const vector*>& bottom, // If base == -1, interpret the base as e and set log_base = 1 exactly. // Otherwise, calculate its log explicitly. const Dtype log_base = (base == Dtype(-1)) ? Dtype(1) : log(base); - CHECK(!isnan(log_base)) - << "NaN result: log(base) = log(" << base << ") = " << log_base; - CHECK(!isinf(log_base)) - << "Inf result: log(base) = log(" << base << ") = " << log_base; + CHECK(!isnan(log_base)) << "NaN result: log(base) = log(" << base << ") = " + << log_base; + CHECK(!isinf(log_base)) << "Inf result: log(base) = log(" << base << ") = " + << log_base; base_scale_ = Dtype(1) / log_base; - CHECK(!isnan(base_scale_)) - << "NaN result: 1/log(base) = 1/log(" << base << ") = " << base_scale_; - CHECK(!isinf(base_scale_)) - << "Inf result: 1/log(base) = 1/log(" << base << ") = " << base_scale_; + CHECK(!isnan(base_scale_)) << "NaN result: 1/log(base) = 1/log(" << base + << ") = " << base_scale_; + CHECK(!isinf(base_scale_)) << "Inf result: 1/log(base) = 1/log(" << base + << ") = " << base_scale_; input_scale_ = this->layer_param_.log_param().scale(); input_shift_ = this->layer_param_.log_param().shift(); backward_num_scale_ = input_scale_ / log_base; @@ -58,7 +58,9 @@ void LogLayer::Forward_cpu(const vector*>& bottom, template void LogLayer::Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { return; } + if (!propagate_down[0]) { + return; + } const int count = bottom[0]->count(); const Dtype* bottom_data = bottom[0]->cpu_data(); const Dtype* top_diff = top[0]->cpu_diff(); @@ -77,11 +79,61 @@ void LogLayer::Backward_cpu(const vector*>& top, caffe_mul(count, top_diff, bottom_diff, bottom_diff); } -#ifdef CPU_ONLY +#ifndef CPU_ONLY +// begin: code modified for OpenCL port +template +void LogLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + const int count = bottom[0]->count(); + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + if (input_scale_ == Dtype(1) && input_shift_ == Dtype(0)) { + caffe_gpu_log(count, bottom_data, top_data); + } else { + caffe_gpu_copy(count, bottom_data, top_data); + if (input_scale_ != Dtype(1)) { + 
caffe_gpu_scal(count, input_scale_, top_data); + } + if (input_shift_ != Dtype(0)) { + caffe_gpu_add_scalar(count, input_shift_, top_data); + } + caffe_gpu_log(count, top_data, top_data); + } + if (base_scale_ != Dtype(1)) { + caffe_gpu_scal(count, base_scale_, top_data); + } +} + +template +void LogLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + if (!propagate_down[0]) { + return; + } + const int count = bottom[0]->count(); + const Dtype* bottom_data = bottom[0]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + caffe_gpu_copy(count, bottom_data, bottom_diff); + if (input_scale_ != Dtype(1)) { + caffe_gpu_scal(count, input_scale_, bottom_diff); + } + if (input_shift_ != Dtype(0)) { + caffe_gpu_add_scalar(count, input_shift_, bottom_diff); + } + caffe_gpu_powx(count, bottom_diff, Dtype(-1), bottom_diff); + if (backward_num_scale_ != Dtype(1)) { + caffe_gpu_scal(count, backward_num_scale_, bottom_diff); + } + caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff); +} +// end: code modified for OpenCL port + +#else STUB_GPU(LogLayer); #endif -INSTANTIATE_CLASS(LogLayer); -REGISTER_LAYER_CLASS(Log); +INSTANTIATE_CLASS (LogLayer); +REGISTER_LAYER_CLASS (Log); } // namespace caffe diff --git a/src/caffe/layers/log_layer.cu b/src/caffe/layers/log_layer.cu deleted file mode 100644 index 847c86cd..00000000 --- a/src/caffe/layers/log_layer.cu +++ /dev/null @@ -1,57 +0,0 @@ -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/neuron_layers.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -void LogLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const int count = bottom[0]->count(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - if (input_scale_ == Dtype(1) && input_shift_ == Dtype(0)) { - caffe_gpu_log(count, bottom_data, top_data); - } else { - caffe_copy(count, bottom_data, top_data); - if (input_scale_ != Dtype(1)) { - caffe_gpu_scal(count, input_scale_, top_data); - } - if (input_shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, input_shift_, top_data); - } - caffe_gpu_log(count, top_data, top_data); - } - if (base_scale_ != Dtype(1)) { - caffe_gpu_scal(count, base_scale_, top_data); - } -} - -template -void LogLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { return; } - const int count = bottom[0]->count(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - caffe_copy(count, bottom_data, bottom_diff); - if (input_scale_ != Dtype(1)) { - caffe_gpu_scal(count, input_scale_, bottom_diff); - } - if (input_shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, input_shift_, bottom_diff); - } - caffe_gpu_powx(count, bottom_diff, Dtype(-1), bottom_diff); - if (backward_num_scale_ != Dtype(1)) { - caffe_gpu_scal(count, backward_num_scale_, bottom_diff); - } - caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff); -} - -INSTANTIATE_LAYER_GPU_FUNCS(LogLayer); - -} // namespace caffe diff --git a/src/caffe/layers/loss_layer.cpp b/src/caffe/layers/loss_layer.cpp index 3496a5c2..64abbaa0 100644 --- a/src/caffe/layers/loss_layer.cpp +++ b/src/caffe/layers/loss_layer.cpp @@ -11,8 +11,8 @@ namespace caffe { template -void LossLayer::LayerSetUp( - const vector*>& bottom, const vector*>& top) { 
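// The same restructuring repeats for every layer in this patch: the per-layer
// .cu file is deleted and its GPU path moves into the .cpp behind the CPU_ONLY
// guard, so the old "#ifdef CPU_ONLY / STUB_GPU / #endif" tail becomes
// "#ifndef CPU_ONLY / OpenCL Forward_gpu and Backward_gpu / #else / STUB_GPU /
// #endif". A schematic of the resulting .cpp layout, with ExampleLayer standing
// in for any of the layers touched here (shown as a comment, not compilable on
// its own):
//
//   #ifndef CPU_ONLY
//   // begin: code modified for OpenCL port
//   template <typename Dtype>
//   void ExampleLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
//                                         const vector<Blob<Dtype>*>& top) {
//     // OpenCL path: clEnqueue* calls or caffe_gpu_* wrappers
//   }
//   template <typename Dtype>
//   void ExampleLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
//       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
//     // OpenCL path
//   }
//   // end: code modified for OpenCL port
//   #else
//   STUB_GPU(ExampleLayer);  // CPU-only builds keep the NOT_IMPLEMENTED stubs
//   #endif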
+void LossLayer::LayerSetUp(const vector*>& bottom, + const vector*>& top) { // LossLayers have a non-zero (1) loss by default. if (this->layer_param_.loss_weight_size() == 0) { this->layer_param_.add_loss_weight(Dtype(1)); @@ -20,14 +20,14 @@ void LossLayer::LayerSetUp( } template -void LossLayer::Reshape( - const vector*>& bottom, const vector*>& top) { +void LossLayer::Reshape(const vector*>& bottom, + const vector*>& top) { CHECK_EQ(bottom[0]->num(), bottom[1]->num()) << "The data and label should have the same number."; vector loss_shape(0); // Loss layers output a scalar; 0 axes. top[0]->Reshape(loss_shape); } -INSTANTIATE_CLASS(LossLayer); +INSTANTIATE_CLASS (LossLayer); } // namespace caffe diff --git a/src/caffe/layers/lrn_layer.cpp b/src/caffe/layers/lrn_layer.cpp index 36c1ace4..0c91435b 100644 --- a/src/caffe/layers/lrn_layer.cpp +++ b/src/caffe/layers/lrn_layer.cpp @@ -3,20 +3,22 @@ #include "caffe/layer.hpp" #include "caffe/util/math_functions.hpp" #include "caffe/vision_layers.hpp" +#include "caffe/util/ocl_wrapper.hpp" +#include "caffe/util/math_functions.hpp" namespace caffe { template void LRNLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { size_ = this->layer_param_.lrn_param().local_size(); CHECK_EQ(size_ % 2, 1) << "LRN only supports odd values for local_size"; pre_pad_ = (size_ - 1) / 2; alpha_ = this->layer_param_.lrn_param().alpha(); beta_ = this->layer_param_.lrn_param().beta(); k_ = this->layer_param_.lrn_param().k(); - if (this->layer_param_.lrn_param().norm_region() == - LRNParameter_NormRegion_WITHIN_CHANNEL) { + if (this->layer_param_.lrn_param().norm_region() + == LRNParameter_NormRegion_WITHIN_CHANNEL) { // Set up split_layer_ to use inputs in the numerator and denominator. split_top_vec_.clear(); split_top_vec_.push_back(&product_input_); @@ -68,7 +70,7 @@ void LRNLayer::LayerSetUp(const vector*>& bottom, template void LRNLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " << "corresponding to (num, channels, height, width)"; num_ = bottom[0]->num(); @@ -115,46 +117,46 @@ void LRNLayer::CrossChannelForward_cpu( for (int i = 0; i < scale_.count(); ++i) { scale_data[i] = k_; } - Blob padded_square(1, channels_ + size_ - 1, height_, width_); + Blob < Dtype > padded_square(1, channels_ + size_ - 1, height_, width_); Dtype* padded_square_data = padded_square.mutable_cpu_data(); caffe_set(padded_square.count(), Dtype(0), padded_square_data); Dtype alpha_over_size = alpha_ / size_; // go through the images for (int n = 0; n < num_; ++n) { // compute the padded square - caffe_sqr(channels_ * height_ * width_, - bottom_data + bottom[0]->offset(n), + caffe_sqr(channels_ * height_ * width_, bottom_data + bottom[0]->offset(n), padded_square_data + padded_square.offset(0, pre_pad_)); // Create the first channel scale for (int c = 0; c < size_; ++c) { - caffe_axpy(height_ * width_, alpha_over_size, - padded_square_data + padded_square.offset(0, c), - scale_data + scale_.offset(n, 0)); + caffe_axpy < Dtype + > (height_ * width_, alpha_over_size, padded_square_data + + padded_square.offset(0, c), scale_data + scale_.offset(n, 0)); } for (int c = 1; c < channels_; ++c) { // copy previous scale - caffe_copy(height_ * width_, - scale_data + scale_.offset(n, c - 1), - scale_data + scale_.offset(n, c)); + caffe_copy < Dtype + > (height_ * width_, scale_data + scale_.offset(n, c - 1), scale_data + + scale_.offset(n, c)); // add head - 
caffe_axpy(height_ * width_, alpha_over_size, - padded_square_data + padded_square.offset(0, c + size_ - 1), - scale_data + scale_.offset(n, c)); + caffe_axpy < Dtype + > (height_ * width_, alpha_over_size, padded_square_data + + padded_square.offset(0, c + size_ - 1), scale_data + + scale_.offset(n, c)); // subtract tail - caffe_axpy(height_ * width_, -alpha_over_size, - padded_square_data + padded_square.offset(0, c - 1), - scale_data + scale_.offset(n, c)); + caffe_axpy < Dtype + > (height_ * width_, -alpha_over_size, padded_square_data + + padded_square.offset(0, c - 1), scale_data + scale_.offset(n, c)); } } // In the end, compute output - caffe_powx(scale_.count(), scale_data, -beta_, top_data); - caffe_mul(scale_.count(), top_data, bottom_data, top_data); + caffe_powx < Dtype > (scale_.count(), scale_data, -beta_, top_data); + caffe_mul < Dtype > (scale_.count(), top_data, bottom_data, top_data); } template -void LRNLayer::WithinChannelForward( - const vector*>& bottom, const vector*>& top) { +void LRNLayer::WithinChannelForward(const vector*>& bottom, + const vector*>& top) { split_layer_->Forward(bottom, split_top_vec_); square_layer_->Forward(square_bottom_vec_, square_top_vec_); pool_layer_->Forward(square_top_vec_, pool_top_vec_); @@ -178,16 +180,15 @@ void LRNLayer::Backward_cpu(const vector*>& top, } template -void LRNLayer::CrossChannelBackward_cpu( - const vector*>& top, const vector& propagate_down, - const vector*>& bottom) { +void LRNLayer::CrossChannelBackward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { const Dtype* top_diff = top[0]->cpu_diff(); const Dtype* top_data = top[0]->cpu_data(); const Dtype* bottom_data = bottom[0]->cpu_data(); const Dtype* scale_data = scale_.cpu_data(); Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - Blob padded_ratio(1, channels_ + size_ - 1, height_, width_); - Blob accum_ratio(1, 1, height_, width_); + Blob < Dtype > padded_ratio(1, channels_ + size_ - 1, height_, width_); + Blob < Dtype > accum_ratio(1, 1, height_, width_); Dtype* padded_ratio_data = padded_ratio.mutable_cpu_data(); Dtype* accum_ratio_data = accum_ratio.mutable_cpu_data(); // We hack a little bit by using the diff() to store an additional result @@ -195,65 +196,129 @@ void LRNLayer::CrossChannelBackward_cpu( caffe_set(padded_ratio.count(), Dtype(0), padded_ratio_data); Dtype cache_ratio_value = 2. 
* alpha_ * beta_ / size_; - caffe_powx(scale_.count(), scale_data, -beta_, bottom_diff); - caffe_mul(scale_.count(), top_diff, bottom_diff, bottom_diff); + caffe_powx < Dtype > (scale_.count(), scale_data, -beta_, bottom_diff); + caffe_mul < Dtype > (scale_.count(), top_diff, bottom_diff, bottom_diff); // go through individual data int inverse_pre_pad = size_ - (size_ + 1) / 2; for (int n = 0; n < num_; ++n) { int block_offset = scale_.offset(n); // first, compute diff_i * y_i / s_i - caffe_mul(channels_ * height_ * width_, - top_diff + block_offset, top_data + block_offset, - padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad)); - caffe_div(channels_ * height_ * width_, - padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad), - scale_data + block_offset, - padded_ratio_data + padded_ratio.offset(0, inverse_pre_pad)); + caffe_mul < Dtype + > (channels_ * height_ * width_, top_diff + block_offset, top_data + + block_offset, padded_ratio_data + + padded_ratio.offset(0, inverse_pre_pad)); + caffe_div < Dtype + > (channels_ * height_ * width_, padded_ratio_data + + padded_ratio.offset(0, inverse_pre_pad), scale_data + + block_offset, padded_ratio_data + + padded_ratio.offset(0, inverse_pre_pad)); // Now, compute the accumulated ratios and the bottom diff caffe_set(accum_ratio.count(), Dtype(0), accum_ratio_data); for (int c = 0; c < size_ - 1; ++c) { - caffe_axpy(height_ * width_, 1., - padded_ratio_data + padded_ratio.offset(0, c), accum_ratio_data); + caffe_axpy < Dtype + > (height_ * width_, 1., padded_ratio_data + + padded_ratio.offset(0, c), accum_ratio_data); } for (int c = 0; c < channels_; ++c) { - caffe_axpy(height_ * width_, 1., - padded_ratio_data + padded_ratio.offset(0, c + size_ - 1), - accum_ratio_data); + caffe_axpy < Dtype + > (height_ * width_, 1., padded_ratio_data + + padded_ratio.offset(0, c + size_ - 1), accum_ratio_data); // compute bottom diff - caffe_mul(height_ * width_, - bottom_data + top[0]->offset(n, c), - accum_ratio_data, accum_ratio_times_bottom); - caffe_axpy(height_ * width_, -cache_ratio_value, - accum_ratio_times_bottom, bottom_diff + top[0]->offset(n, c)); - caffe_axpy(height_ * width_, -1., - padded_ratio_data + padded_ratio.offset(0, c), accum_ratio_data); + caffe_mul < Dtype + > (height_ * width_, bottom_data + top[0]->offset(n, c), accum_ratio_data, accum_ratio_times_bottom); + caffe_axpy < Dtype + > (height_ * width_, -cache_ratio_value, accum_ratio_times_bottom, bottom_diff + + top[0]->offset(n, c)); + caffe_axpy < Dtype + > (height_ * width_, -1., padded_ratio_data + + padded_ratio.offset(0, c), accum_ratio_data); } } } template -void LRNLayer::WithinChannelBackward( - const vector*>& top, const vector& propagate_down, - const vector*>& bottom) { +void LRNLayer::WithinChannelBackward(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { if (propagate_down[0]) { vector product_propagate_down(2, true); product_layer_->Backward(top, product_propagate_down, product_bottom_vec_); power_layer_->Backward(power_top_vec_, propagate_down, pool_top_vec_); pool_layer_->Backward(pool_top_vec_, propagate_down, square_top_vec_); square_layer_->Backward(square_top_vec_, propagate_down, - square_bottom_vec_); + square_bottom_vec_); split_layer_->Backward(split_top_vec_, propagate_down, bottom); } } -#ifdef CPU_ONLY +#ifndef CPU_ONLY +// begin: code modified for OpenCL port +template +void LRNLayer::CrossChannelForward_gpu( + const vector*>& bottom, const vector*>& top) { + // First, compute scale + const Dtype* bottom_data = 
bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + Dtype* scale_data = scale_.mutable_gpu_data(); + // We will launch one kernel for each pixel location, and have the kernel + // go through all the channels. + int n_threads = num_ * height_ * width_; + // NOLINT_NEXT_LINE(whitespace/operators) + LRNFillScale(n_threads, bottom_data, num_, channels_, height_, width_, size_, + alpha_ / size_, k_, scale_data); + n_threads = bottom[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + LRNComputeOutput(n_threads, bottom_data, scale_data, -beta_, top_data); +} + +template +void LRNLayer::CrossChannelBackward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + int n_threads = num_ * height_ * width_; + // NOLINT_NEXT_LINE(whitespace/operators) + LRNComputeDiff(n_threads, bottom[0]->gpu_data(), top[0]->gpu_data(), + scale_.gpu_data(), top[0]->gpu_diff(), num_, channels_, height_, width_, + size_, -beta_, Dtype(2. * alpha_ * beta_ / size_), + bottom[0]->mutable_gpu_diff()); +} + +template +void LRNLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + switch (this->layer_param_.lrn_param().norm_region()) { + case LRNParameter_NormRegion_ACROSS_CHANNELS: + CrossChannelForward_gpu(bottom, top); + break; + case LRNParameter_NormRegion_WITHIN_CHANNEL: + WithinChannelForward(bottom, top); + break; + default: + LOG(FATAL) << "Unknown normalization region."; + } +} + +template +void LRNLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + switch (this->layer_param_.lrn_param().norm_region()) { + case LRNParameter_NormRegion_ACROSS_CHANNELS: + CrossChannelBackward_gpu(top, propagate_down, bottom); + break; + case LRNParameter_NormRegion_WITHIN_CHANNEL: + WithinChannelBackward(top, propagate_down, bottom); + break; + default: + LOG(FATAL) << "Unknown normalization region."; + } +} +// end: code modified for OpenCL port +#else STUB_GPU(LRNLayer); STUB_GPU_FORWARD(LRNLayer, CrossChannelForward); STUB_GPU_BACKWARD(LRNLayer, CrossChannelBackward); #endif -INSTANTIATE_CLASS(LRNLayer); -REGISTER_LAYER_CLASS(LRN); +INSTANTIATE_CLASS (LRNLayer); +REGISTER_LAYER_CLASS (LRN); } // namespace caffe diff --git a/src/caffe/layers/lrn_layer.cu b/src/caffe/layers/lrn_layer.cu deleted file mode 100644 index 001b3c34..00000000 --- a/src/caffe/layers/lrn_layer.cu +++ /dev/null @@ -1,203 +0,0 @@ -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -__global__ void LRNFillScale(const int nthreads, const Dtype* const in, - const int num, const int channels, const int height, - const int width, const int size, const Dtype alpha_over_size, - const Dtype k, Dtype* const scale) { - CUDA_KERNEL_LOOP(index, nthreads) { - // find out the local offset - const int w = index % width; - const int h = (index / width) % height; - const int n = index / width / height; - const int offset = (n * channels * height + h) * width + w; - const int step = height * width; - const Dtype* const in_off = in + offset; - Dtype* const scale_off = scale + offset; - int head = 0; - const int pre_pad = (size - 1) / 2; - const int post_pad = size - pre_pad - 1; - Dtype accum_scale = 0; - // fill the scale at [n, :, h, w] - // accumulate values - while (head < post_pad && head < channels) { - accum_scale += in_off[head * step] * in_off[head * step]; - ++head; - } - // both add and subtract - while (head < channels) { - accum_scale += in_off[head * 
step] * in_off[head * step]; - if (head - size >= 0) { - accum_scale -= in_off[(head - size) * step] - * in_off[(head - size) * step]; - } - scale_off[(head - post_pad) * step] = k + accum_scale * alpha_over_size; - ++head; - } - // subtract only - while (head < channels + post_pad) { - if (head - size >= 0) { - accum_scale -= in_off[(head - size) * step] - * in_off[(head - size) * step]; - } - scale_off[(head - post_pad) * step] = k + accum_scale * alpha_over_size; - ++head; - } - } -} - - -template -void LRNLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - switch (this->layer_param_.lrn_param().norm_region()) { - case LRNParameter_NormRegion_ACROSS_CHANNELS: - CrossChannelForward_gpu(bottom, top); - break; - case LRNParameter_NormRegion_WITHIN_CHANNEL: - WithinChannelForward(bottom, top); - break; - default: - LOG(FATAL) << "Unknown normalization region."; - } -} - -// TODO: check if it would be faster to just put it into the previous kernel. -template -__global__ void LRNComputeOutput(const int nthreads, const Dtype* const in, - const Dtype* const scale, const Dtype negative_beta, Dtype* const out) { - CUDA_KERNEL_LOOP(index, nthreads) { - out[index] = in[index] * pow(scale[index], negative_beta); - } -} - -template -void LRNLayer::CrossChannelForward_gpu( - const vector*>& bottom, const vector*>& top) { - // First, compute scale - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - Dtype* scale_data = scale_.mutable_gpu_data(); - // We will launch one kernel for each pixel location, and have the kernel - // go through all the channels. - int n_threads = num_ * height_ * width_; - // NOLINT_NEXT_LINE(whitespace/operators) - LRNFillScale<<>>( - n_threads, bottom_data, num_, channels_, height_, width_, size_, - alpha_ / size_, k_, scale_data); - CUDA_POST_KERNEL_CHECK; - n_threads = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - LRNComputeOutput<<>>( - n_threads, bottom_data, scale_data, -beta_, top_data); - CUDA_POST_KERNEL_CHECK; -} -template void LRNLayer::CrossChannelForward_gpu( - const vector*>& bottom, const vector*>& top); -template void LRNLayer::CrossChannelForward_gpu( - const vector*>& bottom, const vector*>& top); - - -template -void LRNLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - switch (this->layer_param_.lrn_param().norm_region()) { - case LRNParameter_NormRegion_ACROSS_CHANNELS: - CrossChannelBackward_gpu(top, propagate_down, bottom); - break; - case LRNParameter_NormRegion_WITHIN_CHANNEL: - WithinChannelBackward(top, propagate_down, bottom); - break; - default: - LOG(FATAL) << "Unknown normalization region."; - } -} - -template -__global__ void LRNComputeDiff(const int nthreads, - const Dtype* const bottom_data, const Dtype* const top_data, - const Dtype* const scale, const Dtype* const top_diff, - const int num, const int channels, const int height, - const int width, const int size, const Dtype negative_beta, - const Dtype cache_ratio, Dtype* const bottom_diff) { - CUDA_KERNEL_LOOP(index, nthreads) { - // find out the local offset - const int w = index % width; - const int h = (index / width) % height; - const int n = index / width / height; - const int offset = (n * channels * height + h) * width + w; - const int step = height * width; - const Dtype* const bottom_off = bottom_data + offset; - const Dtype* const top_off = top_data + offset; - const Dtype* const scale_off = scale + offset; - const Dtype* const top_diff_off = 
top_diff + offset; - Dtype* const bottom_diff_off = bottom_diff + offset; - int head = 0; - const int pre_pad = size - (size + 1) / 2; - const int post_pad = size - pre_pad - 1; - Dtype accum_ratio = 0; - // accumulate values - while (head < post_pad && head < channels) { - accum_ratio += top_diff_off[head * step] * top_off[head * step] / - scale_off[head * step]; - ++head; - } - // both add and subtract - while (head < channels) { - accum_ratio += top_diff_off[head * step] * top_off[head * step] / - scale_off[head * step]; - if (head - size >= 0) { - accum_ratio -= top_diff_off[(head - size) * step] * - top_off[(head - size) * step] / scale_off[(head - size) * step]; - } - bottom_diff_off[(head - post_pad) * step] = - top_diff_off[(head - post_pad) * step] - * pow(scale_off[(head - post_pad) * step], negative_beta) - - cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio; - ++head; - } - // subtract only - while (head < channels + post_pad) { - if (head - size >= 0) { - accum_ratio -= top_diff_off[(head - size) * step] * - top_off[(head - size) * step] / scale_off[(head - size) * step]; - } - bottom_diff_off[(head - post_pad) * step] = - top_diff_off[(head - post_pad) * step] - * pow(scale_off[(head - post_pad) * step], negative_beta) - - cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio; - ++head; - } - } -} - -template -void LRNLayer::CrossChannelBackward_gpu( - const vector*>& top, const vector& propagate_down, - const vector*>& bottom) { - int n_threads = num_ * height_ * width_; - // NOLINT_NEXT_LINE(whitespace/operators) - LRNComputeDiff<<>>( - n_threads, bottom[0]->gpu_data(), top[0]->gpu_data(), - scale_.gpu_data(), top[0]->gpu_diff(), num_, channels_, height_, width_, - size_, -beta_, Dtype(2. * alpha_ * beta_ / size_), - bottom[0]->mutable_gpu_diff()); -} -template void LRNLayer::CrossChannelBackward_gpu( - const vector*>& top, const vector& propagate_down, - const vector*>& bottom); -template void LRNLayer::CrossChannelBackward_gpu( - const vector*>& top, const vector& propagate_down, - const vector*>& bottom); - - - -INSTANTIATE_LAYER_GPU_FUNCS(LRNLayer); - -} // namespace caffe diff --git a/src/caffe/layers/memory_data_layer.cpp b/src/caffe/layers/memory_data_layer.cpp index 42de4198..eff0129c 100644 --- a/src/caffe/layers/memory_data_layer.cpp +++ b/src/caffe/layers/memory_data_layer.cpp @@ -10,15 +10,15 @@ namespace caffe { template void MemoryDataLayer::DataLayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { batch_size_ = this->layer_param_.memory_data_param().batch_size(); channels_ = this->layer_param_.memory_data_param().channels(); height_ = this->layer_param_.memory_data_param().height(); width_ = this->layer_param_.memory_data_param().width(); size_ = channels_ * height_ * width_; - CHECK_GT(batch_size_ * size_, 0) << - "batch_size, channels, height, and width must be specified and" - " positive in memory_data_param"; + CHECK_GT(batch_size_ * size_, 0) + << "batch_size, channels, height, and width must be specified and" + " positive in memory_data_param"; vector label_shape(1, batch_size_); top[0]->Reshape(batch_size_, channels_, height_, width_); top[1]->Reshape(label_shape); @@ -32,12 +32,12 @@ void MemoryDataLayer::DataLayerSetUp(const vector*>& bottom, template void MemoryDataLayer::AddDatumVector(const vector& datum_vector) { - CHECK(!has_new_data_) << - "Can't add data until current data has been consumed."; + CHECK(!has_new_data_) + << "Can't add data until current data has been consumed."; size_t num 
= datum_vector.size(); CHECK_GT(num, 0) << "There is no datum to add."; - CHECK_EQ(num % batch_size_, 0) << - "The added data must be a multiple of the batch size."; + CHECK_EQ(num % batch_size_, 0) + << "The added data must be a multiple of the batch size."; added_data_.Reshape(num, channels_, height_, width_); added_label_.Reshape(num, 1, 1, 1); // Apply data transformations (mirror, scale, crop...) @@ -57,11 +57,11 @@ template void MemoryDataLayer::AddMatVector(const vector& mat_vector, const vector& labels) { size_t num = mat_vector.size(); - CHECK(!has_new_data_) << - "Can't add mat until current data has been consumed."; + CHECK(!has_new_data_) + << "Can't add mat until current data has been consumed."; CHECK_GT(num, 0) << "There is no mat to add"; - CHECK_EQ(num % batch_size_, 0) << - "The added data must be a multiple of the batch size."; + CHECK_EQ(num % batch_size_, 0) + << "The added data must be a multiple of the batch size."; added_data_.Reshape(num, channels_, height_, width_); added_label_.Reshape(num, 1, 1, 1); // Apply data transformations (mirror, scale, crop...) @@ -95,8 +95,8 @@ void MemoryDataLayer::Reset(Dtype* data, Dtype* labels, int n) { template void MemoryDataLayer::set_batch_size(int new_size) { - CHECK(!has_new_data_) << - "Can't change batch_size until current data has been consumed."; + CHECK(!has_new_data_) + << "Can't change batch_size until current data has been consumed."; batch_size_ = new_size; added_data_.Reshape(batch_size_, channels_, height_, width_); added_label_.Reshape(batch_size_, 1, 1, 1); @@ -104,7 +104,7 @@ void MemoryDataLayer::set_batch_size(int new_size) { template void MemoryDataLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { CHECK(data_) << "MemoryDataLayer needs to be initalized by calling Reset"; top[0]->Reshape(batch_size_, channels_, height_, width_); top[1]->Reshape(batch_size_, 1, 1, 1); @@ -115,7 +115,7 @@ void MemoryDataLayer::Forward_cpu(const vector*>& bottom, has_new_data_ = false; } -INSTANTIATE_CLASS(MemoryDataLayer); -REGISTER_LAYER_CLASS(MemoryData); +INSTANTIATE_CLASS (MemoryDataLayer); +REGISTER_LAYER_CLASS (MemoryData); } // namespace caffe diff --git a/src/caffe/layers/multinomial_logistic_loss_layer.cpp b/src/caffe/layers/multinomial_logistic_loss_layer.cpp index 4267a594..4d8b69bc 100644 --- a/src/caffe/layers/multinomial_logistic_loss_layer.cpp +++ b/src/caffe/layers/multinomial_logistic_loss_layer.cpp @@ -13,7 +13,7 @@ namespace caffe { template void MultinomialLogisticLossLayer::Reshape( const vector*>& bottom, const vector*>& top) { - LossLayer::Reshape(bottom, top); + LossLayer < Dtype > ::Reshape(bottom, top); CHECK_EQ(bottom[1]->channels(), 1); CHECK_EQ(bottom[1]->height(), 1); CHECK_EQ(bottom[1]->width(), 1); @@ -29,8 +29,7 @@ void MultinomialLogisticLossLayer::Forward_cpu( Dtype loss = 0; for (int i = 0; i < num; ++i) { int label = static_cast(bottom_label[i]); - Dtype prob = std::max( - bottom_data[i * dim + label], Dtype(kLOG_THRESHOLD)); + Dtype prob = std::max(bottom_data[i * dim + label], Dtype(kLOG_THRESHOLD)); loss -= log(prob); } top[0]->mutable_cpu_data()[0] = loss / num; @@ -42,7 +41,7 @@ void MultinomialLogisticLossLayer::Backward_cpu( const vector*>& bottom) { if (propagate_down[1]) { LOG(FATAL) << this->type() - << " Layer cannot backpropagate to label inputs."; + << " Layer cannot backpropagate to label inputs."; } if (propagate_down[0]) { const Dtype* bottom_data = bottom[0]->cpu_data(); @@ -51,17 +50,17 @@ void 
MultinomialLogisticLossLayer::Backward_cpu( int num = bottom[0]->num(); int dim = bottom[0]->count() / bottom[0]->num(); caffe_set(bottom[0]->count(), Dtype(0), bottom_diff); - const Dtype scale = - top[0]->cpu_diff()[0] / num; + const Dtype scale = -top[0]->cpu_diff()[0] / num; for (int i = 0; i < num; ++i) { int label = static_cast(bottom_label[i]); - Dtype prob = std::max( - bottom_data[i * dim + label], Dtype(kLOG_THRESHOLD)); + Dtype prob = std::max(bottom_data[i * dim + label], + Dtype(kLOG_THRESHOLD)); bottom_diff[i * dim + label] = scale / prob; } } } -INSTANTIATE_CLASS(MultinomialLogisticLossLayer); -REGISTER_LAYER_CLASS(MultinomialLogisticLoss); +INSTANTIATE_CLASS (MultinomialLogisticLossLayer); +REGISTER_LAYER_CLASS (MultinomialLogisticLoss); } // namespace caffe diff --git a/src/caffe/layers/mvn_layer.cpp b/src/caffe/layers/mvn_layer.cpp index 3e79bddc..d64f5670 100644 --- a/src/caffe/layers/mvn_layer.cpp +++ b/src/caffe/layers/mvn_layer.cpp @@ -9,17 +9,14 @@ namespace caffe { template void MVNLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - top[0]->Reshape(bottom[0]->num(), bottom[0]->channels(), - bottom[0]->height(), bottom[0]->width()); - mean_.Reshape(bottom[0]->num(), bottom[0]->channels(), - 1, 1); - variance_.Reshape(bottom[0]->num(), bottom[0]->channels(), - 1, 1); - temp_.Reshape(bottom[0]->num(), bottom[0]->channels(), - bottom[0]->height(), bottom[0]->width()); - sum_multiplier_.Reshape(1, 1, - bottom[0]->height(), bottom[0]->width()); + const vector*>& top) { + top[0]->Reshape(bottom[0]->num(), bottom[0]->channels(), bottom[0]->height(), + bottom[0]->width()); + mean_.Reshape(bottom[0]->num(), bottom[0]->channels(), 1, 1); + variance_.Reshape(bottom[0]->num(), bottom[0]->channels(), 1, 1); + temp_.Reshape(bottom[0]->num(), bottom[0]->channels(), bottom[0]->height(), + bottom[0]->width()); + sum_multiplier_.Reshape(1, 1, bottom[0]->height(), bottom[0]->width()); Dtype* multiplier_data = sum_multiplier_.mutable_cpu_data(); caffe_set(sum_multiplier_.count(), Dtype(1), multiplier_data); eps_ = this->layer_param_.mvn_param().eps(); @@ -44,11 +41,10 @@ void MVNLayer::Forward_cpu(const vector*>& bottom, temp_.mutable_cpu_data()); // computes variance using var(X) = E(X^2) - (EX)^2 - caffe_cpu_gemv(CblasNoTrans, num, dim, 1. / dim, bottom_data, - sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data()); // EX - caffe_cpu_gemv(CblasNoTrans, num, dim, 1. / dim, temp_.cpu_data(), - sum_multiplier_.cpu_data(), 0., - variance_.mutable_cpu_data()); // E(X^2) + caffe_cpu_gemv < Dtype + > (CblasNoTrans, num, dim, 1. / dim, bottom_data, sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data()); // EX + caffe_cpu_gemv < Dtype + > (CblasNoTrans, num, dim, 1. 
/ dim, temp_.cpu_data(), sum_multiplier_.cpu_data(), 0., variance_.mutable_cpu_data()); // E(X^2) caffe_powx(mean_.count(), mean_.cpu_data(), Dtype(2), temp_.mutable_cpu_data()); // (EX)^2 caffe_sub(mean_.count(), variance_.cpu_data(), temp_.cpu_data(), @@ -56,31 +52,28 @@ void MVNLayer::Forward_cpu(const vector*>& bottom, // do mean and variance normalization // subtract mean - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, -1., - mean_.cpu_data(), sum_multiplier_.cpu_data(), 0., - temp_.mutable_cpu_data()); + caffe_cpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1., mean_.cpu_data(), sum_multiplier_.cpu_data(), 0., temp_.mutable_cpu_data()); caffe_add(temp_.count(), bottom_data, temp_.cpu_data(), top_data); // normalize variance caffe_powx(variance_.count(), variance_.cpu_data(), Dtype(0.5), - variance_.mutable_cpu_data()); + variance_.mutable_cpu_data()); caffe_add_scalar(variance_.count(), eps_, variance_.mutable_cpu_data()); - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - variance_.cpu_data(), sum_multiplier_.cpu_data(), 0., - temp_.mutable_cpu_data()); + caffe_cpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., variance_.cpu_data(), sum_multiplier_.cpu_data(), 0., temp_.mutable_cpu_data()); caffe_div(temp_.count(), top_data, temp_.cpu_data(), top_data); } else { - caffe_cpu_gemv(CblasNoTrans, num, dim, 1. / dim, bottom_data, - sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data()); // EX + caffe_cpu_gemv < Dtype + > (CblasNoTrans, num, dim, 1. / dim, bottom_data, sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data()); // EX // subtract mean - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, -1., - mean_.cpu_data(), sum_multiplier_.cpu_data(), 0., - temp_.mutable_cpu_data()); + caffe_cpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1., mean_.cpu_data(), sum_multiplier_.cpu_data(), 0., temp_.mutable_cpu_data()); caffe_add(temp_.count(), bottom_data, temp_.cpu_data(), top_data); } @@ -88,8 +81,7 @@ void MVNLayer::Forward_cpu(const vector*>& bottom, template void MVNLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { const Dtype* top_diff = top[0]->cpu_diff(); const Dtype* top_data = top[0]->cpu_data(); const Dtype* bottom_data = bottom[0]->cpu_data(); @@ -105,28 +97,24 @@ void MVNLayer::Backward_cpu(const vector*>& top, if (this->layer_param_.mvn_param().normalize_variance()) { caffe_mul(temp_.count(), top_data, top_diff, bottom_diff); - caffe_cpu_gemv(CblasNoTrans, num, dim, 1., bottom_diff, - sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data()); - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - mean_.cpu_data(), sum_multiplier_.cpu_data(), 0., - bottom_diff); + caffe_cpu_gemv < Dtype + > (CblasNoTrans, num, dim, 1., bottom_diff, sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data()); + caffe_cpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., mean_.cpu_data(), sum_multiplier_.cpu_data(), 0., bottom_diff); caffe_mul(temp_.count(), top_data, bottom_diff, bottom_diff); - caffe_cpu_gemv(CblasNoTrans, num, dim, 1., top_diff, - sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data()); - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - mean_.cpu_data(), sum_multiplier_.cpu_data(), 1., - bottom_diff); + caffe_cpu_gemv < Dtype + > (CblasNoTrans, num, dim, 1., top_diff, sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data()); + caffe_cpu_gemm < Dtype + > 
(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., mean_.cpu_data(), sum_multiplier_.cpu_data(), 1., bottom_diff); caffe_cpu_axpby(temp_.count(), Dtype(1), top_diff, Dtype(-1. / dim), bottom_diff); // put the squares of bottom into temp_ - caffe_powx(temp_.count(), bottom_data, Dtype(2), - temp_.mutable_cpu_data()); - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - variance_.cpu_data(), sum_multiplier_.cpu_data(), 0., - temp_.mutable_cpu_data()); + caffe_powx(temp_.count(), bottom_data, Dtype(2), temp_.mutable_cpu_data()); + caffe_cpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., variance_.cpu_data(), sum_multiplier_.cpu_data(), 0., temp_.mutable_cpu_data()); caffe_div(temp_.count(), bottom_diff, temp_.cpu_data(), bottom_diff); } else { @@ -134,12 +122,120 @@ void MVNLayer::Backward_cpu(const vector*>& top, } } +#ifndef CPU_ONLY +// begin: code modified for OpenCL port +template +void MVNLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + int num; + if (this->layer_param_.mvn_param().across_channels()) + num = bottom[0]->num(); + else + num = bottom[0]->num() * bottom[0]->channels(); + + int dim = bottom[0]->count() / num; + + if (this->layer_param_.mvn_param().normalize_variance()) { + // put the squares of bottom into temp_ + caffe_gpu_powx(bottom[0]->count(), bottom_data, Dtype(2), + temp_.mutable_gpu_data()); + + // computes variance using var(X) = E(X^2) - (EX)^2 + caffe_gpu_gemv < Dtype + > (CblasNoTrans, num, dim, 1. / dim, bottom_data, sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); // EX + caffe_gpu_gemv < Dtype + > (CblasNoTrans, num, dim, 1. / dim, temp_.gpu_data(), sum_multiplier_.gpu_data(), 0., variance_.mutable_gpu_data()); // E(X^2) + caffe_gpu_powx(mean_.count(), mean_.gpu_data(), Dtype(2), + temp_.mutable_gpu_data()); // (EX)^2 + caffe_gpu_sub(mean_.count(), variance_.gpu_data(), temp_.gpu_data(), + variance_.mutable_gpu_data()); // variance + + // do mean and variance normalization + // subtract mean + caffe_gpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1., mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., temp_.mutable_gpu_data()); + + caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(), top_data); + + // normalize variance + caffe_gpu_powx(variance_.count(), variance_.gpu_data(), Dtype(0.5), + variance_.mutable_gpu_data()); + + caffe_gpu_add_scalar(variance_.count(), eps_, variance_.mutable_gpu_data()); + + caffe_gpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., variance_.gpu_data(), sum_multiplier_.gpu_data(), 0., temp_.mutable_gpu_data()); + + caffe_gpu_div(temp_.count(), top_data, temp_.gpu_data(), top_data); + } else { + caffe_gpu_gemv < Dtype + > (CblasNoTrans, num, dim, 1. 
/ dim, bottom_data, sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); // EX + + // subtract mean + caffe_gpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1., mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., temp_.mutable_gpu_data()); + + caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(), top_data); + } +} + +template +void MVNLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + const Dtype* top_diff = top[0]->gpu_diff(); + const Dtype* top_data = top[0]->gpu_data(); + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + + int num; + if (this->layer_param_.mvn_param().across_channels()) + num = bottom[0]->num(); + else + num = bottom[0]->num() * bottom[0]->channels(); + + int dim = bottom[0]->count() / num; + + if (this->layer_param_.mvn_param().normalize_variance()) { + caffe_gpu_mul(temp_.count(), top_data, top_diff, bottom_diff); + caffe_gpu_gemv < Dtype + > (CblasNoTrans, num, dim, 1., bottom_diff, sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); + caffe_gpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., bottom_diff); + caffe_gpu_mul(temp_.count(), top_data, bottom_diff, bottom_diff); + + caffe_gpu_gemv < Dtype + > (CblasNoTrans, num, dim, 1., top_diff, sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); + caffe_gpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., mean_.gpu_data(), sum_multiplier_.gpu_data(), 1., bottom_diff); + + caffe_gpu_axpby(temp_.count(), Dtype(1), top_diff, Dtype(-1. / dim), + bottom_diff); + + // put the squares of bottom into temp_ + caffe_gpu_powx(temp_.count(), bottom_data, Dtype(2), + temp_.mutable_gpu_data()); + + caffe_gpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, num, dim, 1, 1., variance_.gpu_data(), sum_multiplier_.gpu_data(), 0., temp_.mutable_gpu_data()); + + caffe_gpu_div(temp_.count(), bottom_diff, temp_.gpu_data(), bottom_diff); + } else { + caffe_gpu_gemv < Dtype + > (CblasNoTrans, num, dim, 1. 
/ dim, top_diff, sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); + caffe_gpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, num, dim, 1, -1., mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., temp_.mutable_gpu_data()); + caffe_gpu_add(temp_.count(), top_diff, temp_.gpu_data(), bottom_diff); + } +} +// end: code modified for OpenCL port -#ifdef CPU_ONLY +#else STUB_GPU(MVNLayer); #endif -INSTANTIATE_CLASS(MVNLayer); -REGISTER_LAYER_CLASS(MVN); +INSTANTIATE_CLASS (MVNLayer); +REGISTER_LAYER_CLASS (MVN); } // namespace caffe diff --git a/src/caffe/layers/mvn_layer.cu b/src/caffe/layers/mvn_layer.cu deleted file mode 100644 index 3888a0c7..00000000 --- a/src/caffe/layers/mvn_layer.cu +++ /dev/null @@ -1,124 +0,0 @@ -#include -#include - -#include "caffe/common_layers.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -void MVNLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - int num; - if (this->layer_param_.mvn_param().across_channels()) - num = bottom[0]->num(); - else - num = bottom[0]->num() * bottom[0]->channels(); - - int dim = bottom[0]->count() / num; - - if (this->layer_param_.mvn_param().normalize_variance()) { - // put the squares of bottom into temp_ - caffe_gpu_powx(bottom[0]->count(), bottom_data, Dtype(2), - temp_.mutable_gpu_data()); - - // computes variance using var(X) = E(X^2) - (EX)^2 - caffe_gpu_gemv(CblasNoTrans, num, dim, 1. / dim, bottom_data, - sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); // EX - caffe_gpu_gemv(CblasNoTrans, num, dim, 1. / dim, temp_.gpu_data(), - sum_multiplier_.gpu_data(), 0., - variance_.mutable_gpu_data()); // E(X^2) - caffe_gpu_powx(mean_.count(), mean_.gpu_data(), Dtype(2), - temp_.mutable_gpu_data()); // (EX)^2 - caffe_gpu_sub(mean_.count(), variance_.gpu_data(), temp_.gpu_data(), - variance_.mutable_gpu_data()); // variance - - // do mean and variance normalization - // subtract mean - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, -1., - mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., - temp_.mutable_gpu_data()); - - caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(), top_data); - - // normalize variance - caffe_gpu_powx(variance_.count(), variance_.gpu_data(), Dtype(0.5), - variance_.mutable_gpu_data()); - - caffe_gpu_add_scalar(variance_.count(), eps_, variance_.mutable_gpu_data()); - - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - variance_.gpu_data(), sum_multiplier_.gpu_data(), 0., - temp_.mutable_gpu_data()); - - caffe_gpu_div(temp_.count(), top_data, temp_.gpu_data(), top_data); - } else { - caffe_gpu_gemv(CblasNoTrans, num, dim, 1. 
/ dim, bottom_data, - sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); // EX - - // subtract mean - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, -1., - mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., - temp_.mutable_gpu_data()); - - caffe_gpu_add(temp_.count(), bottom_data, temp_.gpu_data(), top_data); - } -} - -template -void MVNLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - - int num; - if (this->layer_param_.mvn_param().across_channels()) - num = bottom[0]->num(); - else - num = bottom[0]->num() * bottom[0]->channels(); - - int dim = bottom[0]->count() / num; - - if (this->layer_param_.mvn_param().normalize_variance()) { - caffe_gpu_mul(temp_.count(), top_data, top_diff, bottom_diff); - caffe_gpu_gemv(CblasNoTrans, num, dim, 1., bottom_diff, - sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - mean_.gpu_data(), sum_multiplier_.gpu_data(), 0., - bottom_diff); - caffe_gpu_mul(temp_.count(), top_data, bottom_diff, bottom_diff); - - caffe_gpu_gemv(CblasNoTrans, num, dim, 1., top_diff, - sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data()); - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - mean_.gpu_data(), sum_multiplier_.gpu_data(), 1., - bottom_diff); - - caffe_gpu_axpby(temp_.count(), Dtype(1), top_diff, Dtype(-1. / dim), - bottom_diff); - - // put the squares of bottom into temp_ - caffe_gpu_powx(temp_.count(), bottom_data, Dtype(2), - temp_.mutable_gpu_data()); - - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, 1, 1., - variance_.gpu_data(), sum_multiplier_.gpu_data(), 0., - temp_.mutable_gpu_data()); - - caffe_gpu_div(temp_.count(), bottom_diff, temp_.gpu_data(), bottom_diff); - } else { - caffe_copy(temp_.count(), top_diff, bottom_diff); - } -} - - -INSTANTIATE_LAYER_GPU_FUNCS(MVNLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/neuron_layer.cpp b/src/caffe/layers/neuron_layer.cpp index ba67b438..4fa61aad 100644 --- a/src/caffe/layers/neuron_layer.cpp +++ b/src/caffe/layers/neuron_layer.cpp @@ -7,10 +7,10 @@ namespace caffe { template void NeuronLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { top[0]->ReshapeLike(*bottom[0]); } -INSTANTIATE_CLASS(NeuronLayer); +INSTANTIATE_CLASS (NeuronLayer); } // namespace caffe diff --git a/src/caffe/layers/pooling_layer.cpp b/src/caffe/layers/pooling_layer.cpp index c8d41499..812ffbb3 100644 --- a/src/caffe/layers/pooling_layer.cpp +++ b/src/caffe/layers/pooling_layer.cpp @@ -15,27 +15,31 @@ using std::max; template void PoolingLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { PoolingParameter pool_param = this->layer_param_.pooling_param(); if (pool_param.global_pooling()) { - CHECK(!(pool_param.has_kernel_size() || - pool_param.has_kernel_h() || pool_param.has_kernel_w())) - << "With Global_pooling: true Filter size cannot specified"; + CHECK( + !(pool_param.has_kernel_size() || pool_param.has_kernel_h() + || pool_param.has_kernel_w())) + << "With Global_pooling: true Filter size cannot specified"; } else { - CHECK(!pool_param.has_kernel_size() != - !(pool_param.has_kernel_h() && pool_param.has_kernel_w())) - << "Filter size is kernel_size OR kernel_h and kernel_w; not both"; - 
CHECK(pool_param.has_kernel_size() || - (pool_param.has_kernel_h() && pool_param.has_kernel_w())) - << "For non-square filters both kernel_h and kernel_w are required."; + CHECK( + !pool_param.has_kernel_size() + != !(pool_param.has_kernel_h() && pool_param.has_kernel_w())) + << "Filter size is kernel_size OR kernel_h and kernel_w; not both"; + CHECK( + pool_param.has_kernel_size() + || (pool_param.has_kernel_h() && pool_param.has_kernel_w())) + << "For non-square filters both kernel_h and kernel_w are required."; } - CHECK((!pool_param.has_pad() && pool_param.has_pad_h() - && pool_param.has_pad_w()) - || (!pool_param.has_pad_h() && !pool_param.has_pad_w())) + CHECK( + (!pool_param.has_pad() && pool_param.has_pad_h() && pool_param.has_pad_w()) + || (!pool_param.has_pad_h() && !pool_param.has_pad_w())) << "pad is pad OR pad_h and pad_w are required."; - CHECK((!pool_param.has_stride() && pool_param.has_stride_h() - && pool_param.has_stride_w()) - || (!pool_param.has_stride_h() && !pool_param.has_stride_w())) + CHECK( + (!pool_param.has_stride() && pool_param.has_stride_h() + && pool_param.has_stride_w()) + || (!pool_param.has_stride_h() && !pool_param.has_stride_w())) << "Stride is stride OR stride_h and stride_w are required."; global_pooling_ = pool_param.global_pooling(); if (global_pooling_) { @@ -65,13 +69,14 @@ void PoolingLayer::LayerSetUp(const vector*>& bottom, } if (global_pooling_) { CHECK(pad_h_ == 0 && pad_w_ == 0 && stride_h_ == 1 && stride_w_ == 1) - << "With Global_pooling: true; only pad = 0 and stride = 1"; + << "With Global_pooling: true; only pad = 0 and stride = 1"; } if (pad_h_ != 0 || pad_w_ != 0) { - CHECK(this->layer_param_.pooling_param().pool() - == PoolingParameter_PoolMethod_AVE - || this->layer_param_.pooling_param().pool() - == PoolingParameter_PoolMethod_MAX) + CHECK( + this->layer_param_.pooling_param().pool() + == PoolingParameter_PoolMethod_AVE + || this->layer_param_.pooling_param().pool() + == PoolingParameter_PoolMethod_MAX) << "Padding implemented only for average and max pooling."; CHECK_LT(pad_h_, kernel_h_); CHECK_LT(pad_w_, kernel_w_); @@ -80,7 +85,7 @@ void PoolingLayer::LayerSetUp(const vector*>& bottom, template void PoolingLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " << "corresponding to (num, channels, height, width)"; channels_ = bottom[0]->channels(); @@ -90,10 +95,10 @@ void PoolingLayer::Reshape(const vector*>& bottom, kernel_h_ = bottom[0]->height(); kernel_w_ = bottom[0]->width(); } - pooled_height_ = static_cast(ceil(static_cast( - height_ + 2 * pad_h_ - kernel_h_) / stride_h_)) + 1; - pooled_width_ = static_cast(ceil(static_cast( - width_ + 2 * pad_w_ - kernel_w_) / stride_w_)) + 1; + pooled_height_ = static_cast(ceil( + static_cast(height_ + 2 * pad_h_ - kernel_h_) / stride_h_)) + 1; + pooled_width_ = static_cast(ceil( + static_cast(width_ + 2 * pad_w_ - kernel_w_) / stride_w_)) + 1; if (pad_h_ || pad_w_) { // If we have padding, ensure that the last pooling starts strictly // inside the image (instead of at the padding); otherwise clip the last. 
@@ -106,22 +111,21 @@ void PoolingLayer::Reshape(const vector*>& bottom, CHECK_LT((pooled_height_ - 1) * stride_h_, height_ + pad_h_); CHECK_LT((pooled_width_ - 1) * stride_w_, width_ + pad_w_); } - top[0]->Reshape(bottom[0]->num(), channels_, pooled_height_, - pooled_width_); + top[0]->Reshape(bottom[0]->num(), channels_, pooled_height_, pooled_width_); if (top.size() > 1) { top[1]->ReshapeLike(*top[0]); } // If max pooling, we will initialize the vector index part. - if (this->layer_param_.pooling_param().pool() == - PoolingParameter_PoolMethod_MAX && top.size() == 1) { + if (this->layer_param_.pooling_param().pool() + == PoolingParameter_PoolMethod_MAX && top.size() == 1) { max_idx_.Reshape(bottom[0]->num(), channels_, pooled_height_, pooled_width_); } // If stochastic pooling, we will initialize the random index part. - if (this->layer_param_.pooling_param().pool() == - PoolingParameter_PoolMethod_STOCHASTIC) { + if (this->layer_param_.pooling_param().pool() + == PoolingParameter_PoolMethod_STOCHASTIC) { rand_idx_.Reshape(bottom[0]->num(), channels_, pooled_height_, - pooled_width_); + pooled_width_); } } @@ -129,7 +133,7 @@ void PoolingLayer::Reshape(const vector*>& bottom, // case? template void PoolingLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* top_data = top[0]->mutable_cpu_data(); const int top_count = top[0]->count(); @@ -231,7 +235,7 @@ void PoolingLayer::Forward_cpu(const vector*>& bottom, template void PoolingLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { if (!propagate_down[0]) { return; } @@ -289,8 +293,8 @@ void PoolingLayer::Backward_cpu(const vector*>& top, wend = min(wend, width_); for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - bottom_diff[h * width_ + w] += - top_diff[ph * pooled_width_ + pw] / pool_size; + bottom_diff[h * width_ + w] += top_diff[ph * pooled_width_ + pw] + / pool_size; } } } @@ -309,11 +313,106 @@ void PoolingLayer::Backward_cpu(const vector*>& top, } } +#ifndef CPU_ONLY +// begin: code modified for OpenCL port +template +void PoolingLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + //Forward_cpu(bottom, top); + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + int count = top[0]->count(); + // We'll output the mask to top[1] if it's of size >1. + const bool use_top_mask = top.size() > 1; + int* mask = NULL; + Dtype* top_mask = NULL; + switch (this->layer_param_.pooling_param().pool()) { + case PoolingParameter_PoolMethod_MAX: + if (use_top_mask) { + top_mask = top[1]->mutable_gpu_data(); + } else { + mask = max_idx_.mutable_gpu_data(); + } + // NOLINT_NEXT_LINE(whitespace/operators) + MaxPoolForward(count, bottom_data, bottom[0]->num(), channels_, height_, + width_, pooled_height_, pooled_width_, kernel_h_, kernel_w_, stride_h_, + stride_w_, pad_h_, pad_w_, top_data, mask, top_mask); + break; + case PoolingParameter_PoolMethod_AVE: + // NOLINT_NEXT_LINE(whitespace/operators) + AvePoolForward(count, bottom_data, bottom[0]->num(), channels_, height_, + width_, pooled_height_, pooled_width_, kernel_h_, kernel_w_, stride_h_, + stride_w_, pad_h_, pad_w_, top_data); + break; + case PoolingParameter_PoolMethod_STOCHASTIC: + if (this->phase_ == TRAIN) { + // We need to create the random index as well. 
+ caffe_gpu_rng_uniform(count, Dtype(0), Dtype(1), + rand_idx_.mutable_gpu_data()); + // NOLINT_NEXT_LINE(whitespace/operators) + StoPoolForwardTrain(count, bottom_data, bottom[0]->num(), channels_, + height_, width_, pooled_height_, pooled_width_, kernel_h_, kernel_w_, + stride_h_, stride_w_, rand_idx_.mutable_gpu_data(), top_data); + } else { + // NOLINT_NEXT_LINE(whitespace/operators) + StoPoolForwardTest(count, bottom_data, bottom[0]->num(), channels_, + height_, width_, pooled_height_, pooled_width_, kernel_h_, kernel_w_, + stride_h_, stride_w_, top_data); + } + break; + default: + LOG(FATAL) << "Unknown pooling method."; + } +} -#ifdef CPU_ONLY +template +void PoolingLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + //Backward_cpu(top, propagate_down, bottom); + if (!propagate_down[0]) { + return; + } + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const int count = bottom[0]->count(); + caffe_gpu_set(count, Dtype(0.), bottom_diff); + // We'll output the mask to top[1] if it's of size >1. + const bool use_top_mask = top.size() > 1; + const int* mask = NULL; + const Dtype* top_mask = NULL; + switch (this->layer_param_.pooling_param().pool()) { + case PoolingParameter_PoolMethod_MAX: + if (use_top_mask) { + top_mask = top[1]->gpu_data(); + } else { + mask = max_idx_.gpu_data(); + } + // NOLINT_NEXT_LINE(whitespace/operators) + MaxPoolBackward(count, top_diff, mask, top_mask, top[0]->num(), channels_, + height_, width_, pooled_height_, pooled_width_, kernel_h_, kernel_w_, + stride_h_, stride_w_, pad_h_, pad_w_, bottom_diff); + break; + case PoolingParameter_PoolMethod_AVE: + // NOLINT_NEXT_LINE(whitespace/operators) + AvePoolBackward(count, top_diff, top[0]->num(), channels_, height_, width_, + pooled_height_, pooled_width_, kernel_h_, kernel_w_, stride_h_, + stride_w_, pad_h_, pad_w_, bottom_diff); + break; + case PoolingParameter_PoolMethod_STOCHASTIC: + // NOLINT_NEXT_LINE(whitespace/operators) + StoPoolBackward(count, rand_idx_.gpu_data(), top_diff, top[0]->num(), + channels_, height_, width_, pooled_height_, pooled_width_, kernel_h_, + kernel_w_, stride_h_, stride_w_, bottom_diff); + break; + default: + LOG(FATAL) << "Unknown pooling method."; + } +} +// end: code modified for OpenCL port +#else STUB_GPU(PoolingLayer); #endif -INSTANTIATE_CLASS(PoolingLayer); +INSTANTIATE_CLASS (PoolingLayer); } // namespace caffe diff --git a/src/caffe/layers/pooling_layer.cu b/src/caffe/layers/pooling_layer.cu deleted file mode 100644 index ca4b13f7..00000000 --- a/src/caffe/layers/pooling_layer.cu +++ /dev/null @@ -1,387 +0,0 @@ -#include -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -__global__ void MaxPoolForward(const int nthreads, - const Dtype* const bottom_data, const int num, const int channels, - const int height, const int width, const int pooled_height, - const int pooled_width, const int kernel_h, const int kernel_w, - const int stride_h, const int stride_w, const int pad_h, const int pad_w, - Dtype* const top_data, int* mask, Dtype* top_mask) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int pw = index % pooled_width; - const int ph = (index / pooled_width) % pooled_height; - const int c = (index / pooled_width / pooled_height) % channels; - const int n = index / pooled_width / pooled_height / channels; - int hstart = ph * stride_h - pad_h; - int wstart = pw * 
stride_w - pad_w; - const int hend = min(hstart + kernel_h, height); - const int wend = min(wstart + kernel_w, width); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - Dtype maxval = -FLT_MAX; - int maxidx = -1; - const Dtype* const bottom_slice = - bottom_data + (n * channels + c) * height * width; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - if (bottom_slice[h * width + w] > maxval) { - maxidx = h * width + w; - maxval = bottom_slice[maxidx]; - } - } - } - top_data[index] = maxval; - if (mask) { - mask[index] = maxidx; - } else { - top_mask[index] = maxidx; - } - } -} - -template -__global__ void AvePoolForward(const int nthreads, - const Dtype* const bottom_data, const int num, const int channels, - const int height, const int width, const int pooled_height, - const int pooled_width, const int kernel_h, const int kernel_w, - const int stride_h, const int stride_w, const int pad_h, const int pad_w, - Dtype* const top_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int pw = index % pooled_width; - const int ph = (index / pooled_width) % pooled_height; - const int c = (index / pooled_width / pooled_height) % channels; - const int n = index / pooled_width / pooled_height / channels; - int hstart = ph * stride_h - pad_h; - int wstart = pw * stride_w - pad_w; - int hend = min(hstart + kernel_h, height + pad_h); - int wend = min(wstart + kernel_w, width + pad_w); - const int pool_size = (hend - hstart) * (wend - wstart); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - hend = min(hend, height); - wend = min(wend, width); - Dtype aveval = 0; - const Dtype* const bottom_slice = - bottom_data + (n * channels + c) * height * width; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - aveval += bottom_slice[h * width + w]; - } - } - top_data[index] = aveval / pool_size; - } -} - -template -__global__ void StoPoolForwardTrain(const int nthreads, - const Dtype* const bottom_data, - const int num, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, Dtype* const rand_idx, Dtype* const top_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int pw = index % pooled_width; - const int ph = (index / pooled_width) % pooled_height; - const int c = (index / pooled_width / pooled_height) % channels; - const int n = index / pooled_width / pooled_height / channels; - const int hstart = ph * stride_h; - const int hend = min(hstart + kernel_h, height); - const int wstart = pw * stride_w; - const int wend = min(wstart + kernel_w, width); - Dtype cumsum = 0.; - const Dtype* const bottom_slice = - bottom_data + (n * channels + c) * height * width; - // First pass: get sum - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - cumsum += bottom_slice[h * width + w]; - } - } - const float thres = rand_idx[index] * cumsum; - // Second pass: get value, and set index. 
- cumsum = 0; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - cumsum += bottom_slice[h * width + w]; - if (cumsum >= thres) { - rand_idx[index] = ((n * channels + c) * height + h) * width + w; - top_data[index] = bottom_slice[h * width + w]; - return; - } - } - } - } -} - - -template -__global__ void StoPoolForwardTest(const int nthreads, - const Dtype* const bottom_data, - const int num, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, Dtype* const top_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int pw = index % pooled_width; - const int ph = (index / pooled_width) % pooled_height; - const int c = (index / pooled_width / pooled_height) % channels; - const int n = index / pooled_width / pooled_height / channels; - const int hstart = ph * stride_h; - const int hend = min(hstart + kernel_h, height); - const int wstart = pw * stride_w; - const int wend = min(wstart + kernel_w, width); - // We set cumsum to be 0 to avoid divide-by-zero problems - Dtype cumsum = FLT_MIN; - Dtype cumvalues = 0.; - const Dtype* const bottom_slice = - bottom_data + (n * channels + c) * height * width; - // First pass: get sum - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - cumsum += bottom_slice[h * width + w]; - cumvalues += bottom_slice[h * width + w] * bottom_slice[h * width + w]; - } - } - top_data[index] = cumvalues / cumsum; - } -} - - -template -void PoolingLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - int count = top[0]->count(); - // We'll output the mask to top[1] if it's of size >1. - const bool use_top_mask = top.size() > 1; - int* mask = NULL; - Dtype* top_mask = NULL; - switch (this->layer_param_.pooling_param().pool()) { - case PoolingParameter_PoolMethod_MAX: - if (use_top_mask) { - top_mask = top[1]->mutable_gpu_data(); - } else { - mask = max_idx_.mutable_gpu_data(); - } - // NOLINT_NEXT_LINE(whitespace/operators) - MaxPoolForward<<>>( - count, bottom_data, bottom[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, kernel_h_, - kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data, - mask, top_mask); - break; - case PoolingParameter_PoolMethod_AVE: - // NOLINT_NEXT_LINE(whitespace/operators) - AvePoolForward<<>>( - count, bottom_data, bottom[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, kernel_h_, - kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, top_data); - break; - case PoolingParameter_PoolMethod_STOCHASTIC: - if (this->phase_ == TRAIN) { - // We need to create the random index as well. 
- caffe_gpu_rng_uniform(count, Dtype(0), Dtype(1), - rand_idx_.mutable_gpu_data()); - // NOLINT_NEXT_LINE(whitespace/operators) - StoPoolForwardTrain<<>>( - count, bottom_data, bottom[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, kernel_h_, - kernel_w_, stride_h_, stride_w_, - rand_idx_.mutable_gpu_data(), top_data); - } else { - // NOLINT_NEXT_LINE(whitespace/operators) - StoPoolForwardTest<<>>( - count, bottom_data, bottom[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, kernel_h_, - kernel_w_, stride_h_, stride_w_, top_data); - } - break; - default: - LOG(FATAL) << "Unknown pooling method."; - } - CUDA_POST_KERNEL_CHECK; -} - - -template -__global__ void MaxPoolBackward(const int nthreads, const Dtype* const top_diff, - const int* const mask, const Dtype* const top_mask, const int num, - const int channels, const int height, const int width, - const int pooled_height, const int pooled_width, const int kernel_h, - const int kernel_w, const int stride_h, const int stride_w, const int pad_h, - const int pad_w, Dtype* const bottom_diff) { - CUDA_KERNEL_LOOP(index, nthreads) { - // find out the local index - // find out the local offset - const int w = index % width; - const int h = (index / width) % height; - const int c = (index / width / height) % channels; - const int n = index / width / height / channels; - const int phstart = - (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1; - const int phend = min((h + pad_h) / stride_h + 1, pooled_height); - const int pwstart = - (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1; - const int pwend = min((w + pad_w) / stride_w + 1, pooled_width); - Dtype gradient = 0; - const int offset = (n * channels + c) * pooled_height * pooled_width; - const Dtype* const top_diff_slice = top_diff + offset; - if (mask) { - const int* const mask_slice = mask + offset; - for (int ph = phstart; ph < phend; ++ph) { - for (int pw = pwstart; pw < pwend; ++pw) { - if (mask_slice[ph * pooled_width + pw] == h * width + w) { - gradient += top_diff_slice[ph * pooled_width + pw]; - } - } - } - } else { - const Dtype* const top_mask_slice = top_mask + offset; - for (int ph = phstart; ph < phend; ++ph) { - for (int pw = pwstart; pw < pwend; ++pw) { - if (top_mask_slice[ph * pooled_width + pw] == h * width + w) { - gradient += top_diff_slice[ph * pooled_width + pw]; - } - } - } - } - bottom_diff[index] = gradient; - } -} - -template -__global__ void AvePoolBackward(const int nthreads, const Dtype* const top_diff, - const int num, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, const int pad_h, const int pad_w, - Dtype* const bottom_diff) { - CUDA_KERNEL_LOOP(index, nthreads) { - // find out the local index - // find out the local offset - const int w = index % width + pad_w; - const int h = (index / width) % height + pad_h; - const int c = (index / width / height) % channels; - const int n = index / width / height / channels; - const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; - const int phend = min(h / stride_h + 1, pooled_height); - const int pwstart = (w < kernel_w) ? 
0 : (w - kernel_w) / stride_w + 1; - const int pwend = min(w / stride_w + 1, pooled_width); - Dtype gradient = 0; - const Dtype* const top_diff_slice = - top_diff + (n * channels + c) * pooled_height * pooled_width; - for (int ph = phstart; ph < phend; ++ph) { - for (int pw = pwstart; pw < pwend; ++pw) { - // figure out the pooling size - int hstart = ph * stride_h - pad_h; - int wstart = pw * stride_w - pad_w; - int hend = min(hstart + kernel_h, height + pad_h); - int wend = min(wstart + kernel_w, width + pad_w); - int pool_size = (hend - hstart) * (wend - wstart); - gradient += top_diff_slice[ph * pooled_width + pw] / pool_size; - } - } - bottom_diff[index] = gradient; - } -} - - -template -__global__ void StoPoolBackward(const int nthreads, - const Dtype* const rand_idx, const Dtype* const top_diff, - const int num, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, Dtype* const bottom_diff) { - CUDA_KERNEL_LOOP(index, nthreads) { - // find out the local index - // find out the local offset - const int w = index % width; - const int h = (index / width) % height; - const int c = (index / width / height) % channels; - const int n = index / width / height / channels; - const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; - const int phend = min(h / stride_h + 1, pooled_height); - const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1; - const int pwend = min(w / stride_w + 1, pooled_width); - Dtype gradient = 0; - const Dtype* const rand_idx_slice = - rand_idx + (n * channels + c) * pooled_height * pooled_width; - const Dtype* const top_diff_slice = - top_diff + (n * channels + c) * pooled_height * pooled_width; - for (int ph = phstart; ph < phend; ++ph) { - for (int pw = pwstart; pw < pwend; ++pw) { - gradient += top_diff_slice[ph * pooled_width + pw] * - (index == static_cast(rand_idx_slice[ph * pooled_width + pw])); - } - } - bottom_diff[index] = gradient; - } -} - - -template -void PoolingLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { - return; - } - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int count = bottom[0]->count(); - caffe_gpu_set(count, Dtype(0.), bottom_diff); - // We'll output the mask to top[1] if it's of size >1. 
- const bool use_top_mask = top.size() > 1; - const int* mask = NULL; - const Dtype* top_mask = NULL; - switch (this->layer_param_.pooling_param().pool()) { - case PoolingParameter_PoolMethod_MAX: - if (use_top_mask) { - top_mask = top[1]->gpu_data(); - } else { - mask = max_idx_.gpu_data(); - } - // NOLINT_NEXT_LINE(whitespace/operators) - MaxPoolBackward<<>>( - count, top_diff, mask, top_mask, top[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, - kernel_h_, kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, - bottom_diff); - break; - case PoolingParameter_PoolMethod_AVE: - // NOLINT_NEXT_LINE(whitespace/operators) - AvePoolBackward<<>>( - count, top_diff, top[0]->num(), channels_, - height_, width_, pooled_height_, pooled_width_, kernel_h_, - kernel_w_, stride_h_, stride_w_, pad_h_, pad_w_, bottom_diff); - break; - case PoolingParameter_PoolMethod_STOCHASTIC: - // NOLINT_NEXT_LINE(whitespace/operators) - StoPoolBackward<<>>( - count, rand_idx_.gpu_data(), top_diff, - top[0]->num(), channels_, height_, width_, pooled_height_, - pooled_width_, kernel_h_, kernel_w_, stride_h_, stride_w_, - bottom_diff); - break; - default: - LOG(FATAL) << "Unknown pooling method."; - } - CUDA_POST_KERNEL_CHECK; -} - - -INSTANTIATE_LAYER_GPU_FUNCS(PoolingLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/power_layer.cpp b/src/caffe/layers/power_layer.cpp index 4fe34c49..c3cb1759 100644 --- a/src/caffe/layers/power_layer.cpp +++ b/src/caffe/layers/power_layer.cpp @@ -4,17 +4,19 @@ #include "caffe/layer.hpp" #include "caffe/util/math_functions.hpp" #include "caffe/vision_layers.hpp" +#include "caffe/util/ocl_util.hpp" +#include "caffe/util/ocl_wrapper.hpp" namespace caffe { template void PowerLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - NeuronLayer::LayerSetUp(bottom, top); + const vector*>& top) { + NeuronLayer < Dtype > ::LayerSetUp(bottom, top); power_ = this->layer_param_.power_param().power(); scale_ = this->layer_param_.power_param().scale(); shift_ = this->layer_param_.power_param().shift(); - diff_scale_ = power_ * scale_; + diff_scale_ = power_ * scale_; } // Compute y = (shift + scale * x)^power @@ -44,8 +46,7 @@ void PowerLayer::Forward_cpu(const vector*>& bottom, template void PowerLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { if (propagate_down[0]) { Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); const int count = bottom[0]->count(); @@ -60,8 +61,8 @@ void PowerLayer::Backward_cpu(const vector*>& top, // Special case for y = (shift + scale * x)^2 // -> dy/dx = 2 * scale * (shift + scale * x) // = diff_scale * shift + diff_scale * scale * x - caffe_cpu_axpby(count, diff_scale_ * scale_, bottom_data, - Dtype(0), bottom_diff); + caffe_cpu_axpby(count, diff_scale_ * scale_, bottom_data, Dtype(0), + bottom_diff); if (shift_ != Dtype(0)) { caffe_add_scalar(count, diff_scale_ * shift_, bottom_diff); } @@ -82,7 +83,7 @@ void PowerLayer::Backward_cpu(const vector*>& top, caffe_add_scalar(count, shift_, bottom_diff); } const Dtype* top_data = top[0]->cpu_data(); - caffe_div(count, top_data, bottom_diff, bottom_diff); + caffe_div < Dtype > (count, top_data, bottom_diff, bottom_diff); if (diff_scale_ != Dtype(1)) { caffe_scal(count, diff_scale_, bottom_diff); } @@ -94,11 +95,86 @@ void PowerLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY +#ifndef CPU_ONLY +// begin: code modified for OpenCL port +template +void 
PowerLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + Dtype* top_data = top[0]->mutable_gpu_data(); + const int count = bottom[0]->count(); + // Special case where we can ignore the input: scale or power is 0. + if (diff_scale_ == Dtype(0)) { + Dtype value = (power_ == 0) ? Dtype(1) : pow(shift_, power_); + ocl_memset(top_data, value, count); + return; + } + const Dtype* bottom_data = bottom[0]->gpu_data(); + caffe_gpu_copy(count, bottom_data, top_data); + if (scale_ != Dtype(1)) { + caffe_gpu_scal(count, scale_, top_data); + } + if (shift_ != Dtype(0)) { + caffe_gpu_add_scalar(count, shift_, top_data); + } + if (power_ != Dtype(1)) { + caffe_gpu_powx(count, top_data, power_, top_data); + } +} + +template +void PowerLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[0]) { + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const int count = bottom[0]->count(); + const Dtype* top_diff = top[0]->gpu_diff(); + if (diff_scale_ == Dtype(0) || power_ == Dtype(1)) { + ocl_memset(bottom_diff, diff_scale_, count); + } else { + const Dtype* bottom_data = bottom[0]->gpu_data(); + // Compute dy/dx = scale * power * (shift + scale * x)^(power - 1) + // = diff_scale * y / (shift + scale * x) + if (power_ == Dtype(2)) { + // Special case for y = (shift + scale * x)^2 + // -> dy/dx = 2 * scale * (shift + scale * x) + // = diff_scale * shift + diff_scale * scale * x + caffe_gpu_axpby(count, diff_scale_ * scale_, bottom_data, Dtype(0), + bottom_diff); + if (shift_ != Dtype(0)) { + caffe_gpu_add_scalar(count, diff_scale_ * shift_, bottom_diff); + } + } else if (shift_ == Dtype(0)) { + // Special case for y = (scale * x)^power + // -> dy/dx = scale * power * (scale * x)^(power - 1) + // = scale * power * (scale * x)^power * (scale * x)^(-1) + // = power * y / x + const Dtype* top_data = top[0]->gpu_data(); + caffe_gpu_div(count, top_data, bottom_data, bottom_diff); + caffe_gpu_scal(count, power_, bottom_diff); + } else { + caffe_gpu_copy(count, bottom_data, bottom_diff); + if (scale_ != Dtype(1)) { + caffe_gpu_scal(count, scale_, bottom_diff); + } + if (shift_ != Dtype(0)) { + caffe_gpu_add_scalar(count, shift_, bottom_diff); + } + const Dtype* top_data = top[0]->gpu_data(); + caffe_gpu_div(count, top_data, bottom_diff, bottom_diff); + if (diff_scale_ != Dtype(1)) { + caffe_gpu_scal(count, diff_scale_, bottom_diff); + } + } + } + caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff); + } +} +// end: code modified for OpenCL port +#else STUB_GPU(PowerLayer); #endif -INSTANTIATE_CLASS(PowerLayer); -REGISTER_LAYER_CLASS(Power); +INSTANTIATE_CLASS (PowerLayer); +REGISTER_LAYER_CLASS (Power); } // namespace caffe diff --git a/src/caffe/layers/power_layer.cu b/src/caffe/layers/power_layer.cu deleted file mode 100644 index 90d94405..00000000 --- a/src/caffe/layers/power_layer.cu +++ /dev/null @@ -1,87 +0,0 @@ -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void PowerLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - // Special case where we can ignore the input: scale or power is 0. - if (diff_scale_ == Dtype(0)) { - Dtype value = (power_ == 0) ?
Dtype(1) : pow(shift_, power_); - caffe_gpu_set(count, value, top_data); - return; - } - const Dtype* bottom_data = bottom[0]->gpu_data(); - caffe_copy(count, bottom_data, top_data); - if (scale_ != Dtype(1)) { - caffe_gpu_scal(count, scale_, top_data); - } - if (shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, shift_, top_data); - } - if (power_ != Dtype(1)) { - caffe_gpu_powx(count, top_data, power_, top_data); - } -} - -template -void PowerLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int count = bottom[0]->count(); - const Dtype* top_diff = top[0]->gpu_diff(); - if (diff_scale_ == Dtype(0) || power_ == Dtype(1)) { - caffe_gpu_set(count, diff_scale_, bottom_diff); - } else { - const Dtype* bottom_data = bottom[0]->gpu_data(); - // Compute dy/dx = scale * power * (shift + scale * x)^(power - 1) - // = diff_scale * y / (shift + scale * x) - if (power_ == Dtype(2)) { - // Special case for y = (shift + scale * x)^2 - // -> dy/dx = 2 * scale * (shift + scale * x) - // = diff_scale * shift + diff_scale * scale * x - caffe_gpu_axpby(count, diff_scale_ * scale_, bottom_data, - Dtype(0), bottom_diff); - if (shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, diff_scale_ * shift_, bottom_diff); - } - } else if (shift_ == Dtype(0)) { - // Special case for y = (scale * x)^power - // -> dy/dx = scale * power * (scale * x)^(power - 1) - // = scale * power * (scale * x)^power * (scale * x)^(-1) - // = power * y / x - const Dtype* top_data = top[0]->gpu_data(); - caffe_gpu_div(count, top_data, bottom_data, bottom_diff); - caffe_gpu_scal(count, power_, bottom_diff); - } else { - caffe_copy(count, bottom_data, bottom_diff); - if (scale_ != Dtype(1)) { - caffe_gpu_scal(count, scale_, bottom_diff); - } - if (shift_ != Dtype(0)) { - caffe_gpu_add_scalar(count, shift_, bottom_diff); - } - const Dtype* top_data = top[0]->gpu_data(); - caffe_gpu_div(count, top_data, bottom_diff, bottom_diff); - if (diff_scale_ != Dtype(1)) { - caffe_gpu_scal(count, diff_scale_, bottom_diff); - } - } - } - caffe_gpu_mul(count, top_diff, bottom_diff, bottom_diff); - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(PowerLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/prelu_layer.cpp b/src/caffe/layers/prelu_layer.cpp index 81831755..55f2e303 100644 --- a/src/caffe/layers/prelu_layer.cpp +++ b/src/caffe/layers/prelu_layer.cpp @@ -24,14 +24,14 @@ void PReLULayer::LayerSetUp(const vector*>& bottom, } else { this->blobs_[0].reset(new Blob(vector(1, channels))); } - shared_ptr > filler; + shared_ptr < Filler > filler; if (prelu_param.has_filler()) { - filler.reset(GetFiller(prelu_param.filler())); + filler.reset(GetFiller < Dtype > (prelu_param.filler())); } else { FillerParameter filler_param; filler_param.set_type("constant"); filler_param.set_value(0.25); - filler.reset(GetFiller(filler_param)); + filler.reset(GetFiller < Dtype > (filler_param)); } filler->Fill(this->blobs_[0].get()); } @@ -89,8 +89,7 @@ void PReLULayer::Forward_cpu(const vector*>& bottom, template void PReLULayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { const Dtype* bottom_data = bottom[0]->cpu_data(); const Dtype* slope_data = this->blobs_[0]->cpu_data(); const Dtype* top_diff = top[0]->cpu_diff(); @@ -123,18 +122,88 @@ void PReLULayer::Backward_cpu(const vector*>& top, Dtype* bottom_diff = 
bottom[0]->mutable_cpu_diff(); for (int i = 0; i < count; ++i) { int c = (i / dim) % channels / div_factor; - bottom_diff[i] = top_diff[i] * ((bottom_data[i] > 0) - + slope_data[c] * (bottom_data[i] <= 0)); + bottom_diff[i] = top_diff[i] + * ((bottom_data[i] > 0) + slope_data[c] * (bottom_data[i] <= 0)); } } } +#ifndef CPU_ONLY +// begin: code modified for OpenCL port +template +void PReLULayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + const int count = bottom[0]->count(); + const int dim = bottom[0]->count(2); + const int channels = bottom[0]->channels(); + const Dtype* slope_data = this->blobs_[0]->gpu_data(); + const int div_factor = channel_shared_ ? channels : 1; + + if (top[0] == bottom[0]) { + caffe_gpu_copy(count, bottom_data, bottom_memory_.mutable_gpu_data()); + } + PReLUForward(count, channels, dim, bottom_data, top_data, slope_data, + div_factor); +} + +template +void PReLULayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); + const int count = bottom[0]->count(); + const int dim = bottom[0]->count(2); + const int channels = bottom[0]->channels(); + + if (top[0] == bottom[0]) { + bottom_data = bottom_memory_.gpu_data(); + } + + // Propagate to param + // Since to write bottom diff will affect top diff if top and bottom blobs + // are identical (in-place computaion), we first compute param backward to + // keep top_diff unchanged. + if (this->param_propagate_down_[0]) { + Dtype* slope_diff = this->blobs_[0]->mutable_gpu_diff(); + int cdim = channels * dim; + Dtype dsum = 0.; + for (int n = 0; n < bottom[0]->num(); ++n) { + // compute element-wise diff + // NOLINT_NEXT_LINE(whitespace/operators) + PReLUParamBackward(cdim, top_diff, top[0]->offset(n), bottom_data, + bottom[0]->offset(n), backward_buff_.mutable_gpu_diff()); + if (channel_shared_) { + Dtype d; + caffe_gpu_dot < Dtype + > (channels * dim, backward_buff_.gpu_diff(), multiplier_.gpu_data(), &d); + dsum += d; + } else { + caffe_gpu_gemv < Dtype + > (CblasNoTrans, channels, dim, 1., backward_buff_.gpu_diff(), multiplier_.gpu_data(), 1., slope_diff); + } + } + if (channel_shared_) { + caffe_gpu_add_scalar(this->blobs_[0]->count(), Dtype(dsum), slope_diff); + } + } + // Propagate to bottom + if (propagate_down[0]) { + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const Dtype* slope_data = this->blobs_[0]->gpu_data(); + int div_factor = channel_shared_ ? 
channels : 1; + // NOLINT_NEXT_LINE(whitespace/operators) + PReLUBackward(count, channels, dim, top_diff, bottom_data, bottom_diff, + slope_data, div_factor); + } +} +// end: code modified for OpenCL port -#ifdef CPU_ONLY +#else STUB_GPU(PReLULayer); #endif -INSTANTIATE_CLASS(PReLULayer); -REGISTER_LAYER_CLASS(PReLU); - +INSTANTIATE_CLASS (PReLULayer); +REGISTER_LAYER_CLASS (PReLU); } // namespace caffe diff --git a/src/caffe/layers/prelu_layer.cu b/src/caffe/layers/prelu_layer.cu deleted file mode 100644 index e1f20048..00000000 --- a/src/caffe/layers/prelu_layer.cu +++ /dev/null @@ -1,128 +0,0 @@ -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -// CUDA kernele for forward -template -__global__ void PReLUForward(const int n, const int channels, const int dim, - const Dtype* in, Dtype* out, const Dtype* slope_data, - const int div_factor) { - CUDA_KERNEL_LOOP(index, n) { - int c = (index / dim) % channels / div_factor; - out[index] = in[index] > 0 ? in[index] : in[index] * slope_data[c]; - } -} - -// CUDA kernel for bottom backward -template -__global__ void PReLUBackward(const int n, const int channels, const int dim, - const Dtype* in_diff, const Dtype* in_data, Dtype* out_diff, - const Dtype* slope_data, const int div_factor) { - CUDA_KERNEL_LOOP(index, n) { - int c = (index / dim) % channels / div_factor; - out_diff[index] = in_diff[index] * ((in_data[index] > 0) - + (in_data[index] <= 0) * slope_data[c]); - } -} - -// CUDA kernel for element-wise parameter backward -template -__global__ void PReLUParamBackward(const int n, const Dtype* in_diff, - const Dtype* in_data, Dtype* out_diff) { - CUDA_KERNEL_LOOP(index, n) { - out_diff[index] = in_diff[index] * in_data[index] * (in_data[index] <= 0); - } -} - -template -void PReLULayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - const int dim = bottom[0]->count(2); - const int channels = bottom[0]->channels(); - const Dtype* slope_data = this->blobs_[0]->gpu_data(); - const int div_factor = channel_shared_ ? channels : 1; - - // For in-place computation - if (top[0] == bottom[0]) { - caffe_copy(count, bottom_data, bottom_memory_.mutable_gpu_data()); - } - - // NOLINT_NEXT_LINE(whitespace/operators) - PReLUForward<<>>( - count, channels, dim, bottom_data, top_data, slope_data, div_factor); - CUDA_POST_KERNEL_CHECK; -} - -template -void PReLULayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - const int count = bottom[0]->count(); - const int dim = bottom[0]->count(2); - const int channels = bottom[0]->channels(); - - // For in-place computation - if (top[0] == bottom[0]) { - bottom_data = bottom_memory_.gpu_data(); - } - - // Propagate to param - // Since to write bottom diff will affect top diff if top and bottom blobs - // are identical (in-place computaion), we first compute param backward to - // keep top_diff unchanged. 
- if (this->param_propagate_down_[0]) { - Dtype* slope_diff = this->blobs_[0]->mutable_gpu_diff(); - int cdim = channels * dim; - Dtype dsum = 0.; - for (int n = 0; n < bottom[0]->num(); ++n) { - // compute element-wise diff - // NOLINT_NEXT_LINE(whitespace/operators) - PReLUParamBackward<<>>( - cdim, top_diff + top[0]->offset(n), - bottom_data + bottom[0]->offset(n), - backward_buff_.mutable_gpu_diff()); - CUDA_POST_KERNEL_CHECK; - if (channel_shared_) { - Dtype d; - caffe_gpu_dot(channels * dim, backward_buff_.gpu_diff(), - multiplier_.gpu_data(), &d); - dsum += d; - } else { - caffe_gpu_gemv(CblasNoTrans, channels, dim, 1., - backward_buff_.gpu_diff(), multiplier_.gpu_data(), 1., - slope_diff); - } - } - if (channel_shared_) { - caffe_gpu_add_scalar(this->blobs_[0]->count(), Dtype(dsum), slope_diff); - } - } - // Propagate to bottom - if (propagate_down[0]) { - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const Dtype* slope_data = this->blobs_[0]->gpu_data(); - int div_factor = channel_shared_ ? channels : 1; - // NOLINT_NEXT_LINE(whitespace/operators) - PReLUBackward<<>>( - count, channels, dim, top_diff, bottom_data, bottom_diff, slope_data, - div_factor); - CUDA_POST_KERNEL_CHECK; - } -} - - -INSTANTIATE_LAYER_GPU_FUNCS(PReLULayer); - - -} // namespace caffe diff --git a/src/caffe/layers/reduction_layer.cpp b/src/caffe/layers/reduction_layer.cpp index 8ae6329e..ace74b28 100644 --- a/src/caffe/layers/reduction_layer.cpp +++ b/src/caffe/layers/reduction_layer.cpp @@ -10,13 +10,13 @@ namespace caffe { template void ReductionLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { op_ = this->layer_param_.reduction_param().operation(); } template void ReductionLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { axis_ = bottom[0]->CanonicalAxisIndex( this->layer_param_.reduction_param().axis()); // In the output, we'll keep all axes up to the reduction axis, but @@ -24,13 +24,13 @@ void ReductionLayer::Reshape(const vector*>& bottom, // Note: currently reducing along non-tail axes is not supported; otherwise, // we'd need to also copy any axes following an "end_axis". vector top_shape(bottom[0]->shape().begin(), - bottom[0]->shape().begin() + axis_); + bottom[0]->shape().begin() + axis_); top[0]->Reshape(top_shape); num_ = bottom[0]->count(0, axis_); dim_ = bottom[0]->count(axis_); CHECK_EQ(num_, top[0]->count()); - if (op_ == ReductionParameter_ReductionOp_SUM || - op_ == ReductionParameter_ReductionOp_MEAN) { + if (op_ == ReductionParameter_ReductionOp_SUM + || op_ == ReductionParameter_ReductionOp_MEAN) { vector sum_mult_shape(1, dim_); sum_multiplier_.Reshape(sum_mult_shape); caffe_set(dim_, Dtype(1), sum_multiplier_.mutable_cpu_data()); @@ -42,8 +42,8 @@ void ReductionLayer::Reshape(const vector*>& bottom, } template -void ReductionLayer::Forward_cpu( - const vector*>& bottom, const vector*>& top) { +void ReductionLayer::Forward_cpu(const vector*>& bottom, + const vector*>& top) { const Dtype* bottom_data = bottom[0]->cpu_data(); const Dtype* mult_data = NULL; if (sum_multiplier_.count() > 0) { @@ -79,7 +79,9 @@ void ReductionLayer::Forward_cpu( template void ReductionLayer::Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { return; } + if (!propagate_down[0]) { + return; + } // Get bottom_data, if needed. 
const Dtype* bottom_data = NULL; switch (op_) { @@ -87,7 +89,7 @@ void ReductionLayer::Backward_cpu(const vector*>& top, case ReductionParameter_ReductionOp_SUM: case ReductionParameter_ReductionOp_MEAN: break; - // Operations that need bottom_data + // Operations that need bottom_data case ReductionParameter_ReductionOp_ASUM: case ReductionParameter_ReductionOp_SUMSQ: bottom_data = bottom[0]->cpu_data(); @@ -122,11 +124,102 @@ void ReductionLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY +#ifndef CPU_ONLY +// begin: code modified for OpenCL port +template +void ReductionLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + //Forward_cpu(bottom, top); +//return; + const Dtype* bottom_data = bottom[0]->gpu_data(); + const Dtype* mult_data = NULL; + if (sum_multiplier_.count() > 0) { + mult_data = sum_multiplier_.gpu_data(); + } + Dtype* top_data = top[0]->mutable_cpu_data(); + size_t bottom_offset = 0; + for (int i = 0; i < num_; ++i) { + switch (op_) { + case ReductionParameter_ReductionOp_SUM: + case ReductionParameter_ReductionOp_MEAN: + caffe_gpu_dot(dim_, mult_data, 0, bottom_data, bottom_offset, top_data); + break; + case ReductionParameter_ReductionOp_ASUM: + caffe_gpu_asum(dim_, bottom_data, bottom_offset, top_data); + break; + case ReductionParameter_ReductionOp_SUMSQ: + caffe_gpu_dot(dim_, bottom_data, bottom_offset, bottom_data, bottom_offset, top_data); + break; + default: + LOG(FATAL) << "Unknown reduction op: " + << ReductionParameter_ReductionOp_Name(op_); + } + bottom_offset += dim_; + ++top_data; + } + if (coeff_ != Dtype(1)) { + // Reset the top_data pointer. + top_data = top[0]->mutable_gpu_data(); + caffe_gpu_scal(num_, coeff_, top_data); + } +} + +template +void ReductionLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + if (!propagate_down[0]) { + return; + } + // Get bottom_data, if needed. 
+ const Dtype* bottom_data = NULL; + switch (op_) { + // Operations that don't need bottom_data + case ReductionParameter_ReductionOp_SUM: + case ReductionParameter_ReductionOp_MEAN: + break; + // Operations that need bottom_data + case ReductionParameter_ReductionOp_ASUM: + case ReductionParameter_ReductionOp_SUMSQ: + bottom_data = bottom[0]->gpu_data(); + break; + default: + LOG(FATAL) << "Unknown reduction op: " + << ReductionParameter_ReductionOp_Name(op_); + } + const Dtype* top_diff = top[0]->cpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + int bottom_data_offset = 0; + int bottom_diff_offset = 0; + for (int i = 0; i < num_; ++i) { + const Dtype bottom_coeff = (*top_diff) * coeff_; + switch (op_) { + case ReductionParameter_ReductionOp_SUM: + case ReductionParameter_ReductionOp_MEAN: + caffe_gpu_set(dim_, bottom_coeff, bottom_diff, bottom_diff_offset); + break; + case ReductionParameter_ReductionOp_ASUM: + caffe_gpu_sign(dim_, bottom_data, bottom_data_offset, bottom_diff, bottom_diff_offset); + caffe_gpu_scal(dim_, bottom_coeff, bottom_diff, bottom_diff_offset); + break; + case ReductionParameter_ReductionOp_SUMSQ: + caffe_gpu_scale(dim_, 2 * bottom_coeff, bottom_data, bottom_data_offset, bottom_diff, bottom_diff_offset); + break; + default: + LOG(FATAL) << "Unknown reduction op: " + << ReductionParameter_ReductionOp_Name(op_); + } + bottom_data_offset += dim_; + bottom_diff_offset += dim_; + ++top_diff; + } +} +// end: code modified for OpenCL port + +#else STUB_GPU(ReductionLayer); #endif -INSTANTIATE_CLASS(ReductionLayer); -REGISTER_LAYER_CLASS(Reduction); +INSTANTIATE_CLASS (ReductionLayer); +REGISTER_LAYER_CLASS (Reduction); } // namespace caffe diff --git a/src/caffe/layers/reduction_layer.cu b/src/caffe/layers/reduction_layer.cu deleted file mode 100644 index 2dbd3bc9..00000000 --- a/src/caffe/layers/reduction_layer.cu +++ /dev/null @@ -1,93 +0,0 @@ -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void ReductionLayer::Forward_gpu( - const vector*>& bottom, const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - const Dtype* mult_data = NULL; - if (sum_multiplier_.count() > 0) { - mult_data = sum_multiplier_.gpu_data(); - } - Dtype* top_data = top[0]->mutable_cpu_data(); - for (int i = 0; i < num_; ++i) { - switch (op_) { - case ReductionParameter_ReductionOp_SUM: - case ReductionParameter_ReductionOp_MEAN: - caffe_gpu_dot(dim_, mult_data, bottom_data, top_data); - break; - case ReductionParameter_ReductionOp_ASUM: - caffe_gpu_asum(dim_, bottom_data, top_data); - break; - case ReductionParameter_ReductionOp_SUMSQ: - caffe_gpu_dot(dim_, bottom_data, bottom_data, top_data); - break; - default: - LOG(FATAL) << "Unknown reduction op: " - << ReductionParameter_ReductionOp_Name(op_); - } - bottom_data += dim_; - ++top_data; - } - if (coeff_ != Dtype(1)) { - // Reset the top_data pointer. - top_data = top[0]->mutable_gpu_data(); - caffe_gpu_scal(num_, coeff_, top_data); - } -} - -template -void ReductionLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { return; } - // Get bottom_data, if needed. 
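The ported ReductionLayer above threads explicit element offsets (bottom_offset, bottom_data_offset, bottom_diff_offset) through the caffe_gpu_* wrappers instead of advancing raw pointers, since an OpenCL buffer handle cannot be offset by host-side pointer arithmetic the way a CUDA device pointer can. Per slice of length dim_, the forward reductions compute the following (a CPU sketch with illustrative names; MEAN is SUM with the 1/dim factor folded into coeff_):

    #include <cmath>
    #include <cstddef>
    enum ReduceOp { SUM, ASUM, SUMSQ };
    template <typename Dtype>
    Dtype reduce_slice(ReduceOp op, std::size_t dim, const Dtype* data,
                       std::size_t offset) {
      Dtype acc = 0;
      for (std::size_t i = 0; i < dim; ++i) {
        const Dtype v = data[offset + i];
        if (op == SUM)       acc += v;            // SUM and MEAN
        else if (op == ASUM) acc += std::abs(v);
        else                 acc += v * v;        // SUMSQ
      }
      return acc;
    }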
- const Dtype* bottom_data = NULL; - switch (op_) { - // Operations that don't need bottom_data - case ReductionParameter_ReductionOp_SUM: - case ReductionParameter_ReductionOp_MEAN: - break; - // Operations that need bottom_data - case ReductionParameter_ReductionOp_ASUM: - case ReductionParameter_ReductionOp_SUMSQ: - bottom_data = bottom[0]->gpu_data(); - break; - default: - LOG(FATAL) << "Unknown reduction op: " - << ReductionParameter_ReductionOp_Name(op_); - } - const Dtype* top_diff = top[0]->cpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - for (int i = 0; i < num_; ++i) { - const Dtype bottom_coeff = (*top_diff) * coeff_; - switch (op_) { - case ReductionParameter_ReductionOp_SUM: - case ReductionParameter_ReductionOp_MEAN: - caffe_gpu_set(dim_, bottom_coeff, bottom_diff); - break; - case ReductionParameter_ReductionOp_ASUM: - caffe_gpu_sign(dim_, bottom_data, bottom_diff); - caffe_gpu_scal(dim_, bottom_coeff, bottom_diff); - break; - case ReductionParameter_ReductionOp_SUMSQ: - caffe_gpu_scale(dim_, 2 * bottom_coeff, bottom_data, bottom_diff); - break; - default: - LOG(FATAL) << "Unknown reduction op: " - << ReductionParameter_ReductionOp_Name(op_); - } - bottom_data += dim_; - bottom_diff += dim_; - ++top_diff; - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(ReductionLayer); - -} // namespace caffe diff --git a/src/caffe/layers/relu_layer.cpp b/src/caffe/layers/relu_layer.cpp index cc00319a..3d2eaf2e 100644 --- a/src/caffe/layers/relu_layer.cpp +++ b/src/caffe/layers/relu_layer.cpp @@ -5,7 +5,6 @@ #include "caffe/vision_layers.hpp" namespace caffe { - template void ReLULayer::Forward_cpu(const vector*>& bottom, const vector*>& top) { @@ -21,8 +20,7 @@ void ReLULayer::Forward_cpu(const vector*>& bottom, template void ReLULayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { if (propagate_down[0]) { const Dtype* bottom_data = bottom[0]->cpu_data(); const Dtype* top_diff = top[0]->cpu_diff(); @@ -30,17 +28,42 @@ void ReLULayer::Backward_cpu(const vector*>& top, const int count = bottom[0]->count(); Dtype negative_slope = this->layer_param_.relu_param().negative_slope(); for (int i = 0; i < count; ++i) { - bottom_diff[i] = top_diff[i] * ((bottom_data[i] > 0) - + negative_slope * (bottom_data[i] <= 0)); + bottom_diff[i] = top_diff[i] + * ((bottom_data[i] > 0) + negative_slope * (bottom_data[i] <= 0)); } } } +#ifndef CPU_ONLY +// begin: code modified for OpenCL port +template +void ReLULayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + const int count = bottom[0]->count(); + Dtype negative_slope = this->layer_param_.relu_param().negative_slope(); + ReLUForward(count, bottom_data, top_data, negative_slope); +} + +template +void ReLULayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[0]) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const int count = bottom[0]->count(); + Dtype negative_slope = this->layer_param_.relu_param().negative_slope(); + ReLUBackward(count, top_diff, bottom_data, bottom_diff, negative_slope); + } +} +// end: code modified for OpenCL port -#ifdef CPU_ONLY +#else STUB_GPU(ReLULayer); #endif -INSTANTIATE_CLASS(ReLULayer); +INSTANTIATE_CLASS (ReLULayer); } // 
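ReLUForward and ReLUBackward in the relu_layer.cpp hunk above are host wrappers around the OpenCL kernels; the math they have to reproduce is the leaky ReLU of the CPU path and of the deleted relu_layer.cu. A CPU reference sketch (function names illustrative):

    template <typename Dtype>
    void relu_forward_ref(int n, const Dtype* in, Dtype* out, Dtype negative_slope) {
      for (int i = 0; i < n; ++i)
        out[i] = in[i] > 0 ? in[i] : in[i] * negative_slope;
    }

    template <typename Dtype>
    void relu_backward_ref(int n, const Dtype* in_diff, const Dtype* in_data,
                           Dtype* out_diff, Dtype negative_slope) {
      for (int i = 0; i < n; ++i)
        out_diff[i] = in_diff[i]
            * ((in_data[i] > 0) + negative_slope * (in_data[i] <= 0));
    }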
namespace caffe diff --git a/src/caffe/layers/relu_layer.cu b/src/caffe/layers/relu_layer.cu deleted file mode 100644 index b8924c85..00000000 --- a/src/caffe/layers/relu_layer.cu +++ /dev/null @@ -1,65 +0,0 @@ -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -__global__ void ReLUForward(const int n, const Dtype* in, Dtype* out, - Dtype negative_slope) { - CUDA_KERNEL_LOOP(index, n) { - out[index] = in[index] > 0 ? in[index] : in[index] * negative_slope; - } -} - -template -void ReLULayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - Dtype negative_slope = this->layer_param_.relu_param().negative_slope(); - // NOLINT_NEXT_LINE(whitespace/operators) - ReLUForward<<>>( - count, bottom_data, top_data, negative_slope); - CUDA_POST_KERNEL_CHECK; - // << " count: " << count << " bottom_data: " - // << (unsigned long)bottom_data - // << " top_data: " << (unsigned long)top_data - // << " blocks: " << CAFFE_GET_BLOCKS(count) - // << " threads: " << CAFFE_CUDA_NUM_THREADS; -} - -template -__global__ void ReLUBackward(const int n, const Dtype* in_diff, - const Dtype* in_data, Dtype* out_diff, Dtype negative_slope) { - CUDA_KERNEL_LOOP(index, n) { - out_diff[index] = in_diff[index] * ((in_data[index] > 0) - + (in_data[index] <= 0) * negative_slope); - } -} - -template -void ReLULayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int count = bottom[0]->count(); - Dtype negative_slope = this->layer_param_.relu_param().negative_slope(); - // NOLINT_NEXT_LINE(whitespace/operators) - ReLUBackward<<>>( - count, top_diff, bottom_data, bottom_diff, negative_slope); - CUDA_POST_KERNEL_CHECK; - } -} - - -INSTANTIATE_LAYER_GPU_FUNCS(ReLULayer); - - -} // namespace caffe diff --git a/src/caffe/layers/reshape_layer.cpp b/src/caffe/layers/reshape_layer.cpp index ffe970f2..a2377d87 100644 --- a/src/caffe/layers/reshape_layer.cpp +++ b/src/caffe/layers/reshape_layer.cpp @@ -31,8 +31,9 @@ template void ReshapeLayer::Reshape(const vector*>& bottom, const vector*>& top) { const int input_start_axis = this->layer_param_.reshape_param().axis(); - const int start_axis = (input_start_axis >= 0) ? input_start_axis : - bottom[0]->num_axes() + input_start_axis + 1; + const int start_axis = + (input_start_axis >= 0) ? 
+ input_start_axis : bottom[0]->num_axes() + input_start_axis + 1; CHECK_GE(start_axis, 0) << "axis " << input_start_axis << " out of range"; CHECK_LE(start_axis, bottom[0]->num_axes()) << "axis " << input_start_axis << " out of range for " << bottom[0]->num_axes() << "-D input blob"; @@ -63,8 +64,8 @@ void ReshapeLayer::Reshape(const vector*>& bottom, CHECK_GT(bottom[0]->num_axes(), start_axis + copy_axis_index) << "new shape contains a 0, but there was no corresponding bottom axis " << "to copy"; - top_shape[start_axis + copy_axis_index] = - bottom[0]->shape(start_axis + copy_axis_index); + top_shape[start_axis + copy_axis_index] = bottom[0]->shape( + start_axis + copy_axis_index); } if (inferred_axis_ >= 0) { // A -1 dim was specified; infer the correct dimension by computing the @@ -89,7 +90,7 @@ void ReshapeLayer::Reshape(const vector*>& bottom, top[0]->ShareDiff(*bottom[0]); } -INSTANTIATE_CLASS(ReshapeLayer); -REGISTER_LAYER_CLASS(Reshape); +INSTANTIATE_CLASS (ReshapeLayer); +REGISTER_LAYER_CLASS (Reshape); } // namespace caffe diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp index cc236fe1..f074ac51 100644 --- a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp +++ b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp @@ -11,7 +11,7 @@ namespace caffe { template void SigmoidCrossEntropyLossLayer::LayerSetUp( const vector*>& bottom, const vector*>& top) { - LossLayer::LayerSetUp(bottom, top); + LossLayer < Dtype > ::LayerSetUp(bottom, top); sigmoid_bottom_vec_.clear(); sigmoid_bottom_vec_.push_back(bottom[0]); sigmoid_top_vec_.clear(); @@ -22,9 +22,9 @@ void SigmoidCrossEntropyLossLayer::LayerSetUp( template void SigmoidCrossEntropyLossLayer::Reshape( const vector*>& bottom, const vector*>& top) { - LossLayer::Reshape(bottom, top); - CHECK_EQ(bottom[0]->count(), bottom[1]->count()) << - "SIGMOID_CROSS_ENTROPY_LOSS layer inputs must have the same count."; + LossLayer < Dtype > ::Reshape(bottom, top); + CHECK_EQ(bottom[0]->count(), bottom[1]->count()) + << "SIGMOID_CROSS_ENTROPY_LOSS layer inputs must have the same count."; sigmoid_layer_->Reshape(sigmoid_bottom_vec_, sigmoid_top_vec_); } @@ -42,8 +42,9 @@ void SigmoidCrossEntropyLossLayer::Forward_cpu( const Dtype* target = bottom[1]->cpu_data(); Dtype loss = 0; for (int i = 0; i < count; ++i) { - loss -= input_data[i] * (target[i] - (input_data[i] >= 0)) - - log(1 + exp(input_data[i] - 2 * input_data[i] * (input_data[i] >= 0))); + loss -= input_data[i] * (target[i] - (input_data[i] >= 0)) + - log( + 1 + exp(input_data[i] - 2 * input_data[i] * (input_data[i] >= 0))); } top[0]->mutable_cpu_data()[0] = loss / num; } @@ -54,7 +55,7 @@ void SigmoidCrossEntropyLossLayer::Backward_cpu( const vector*>& bottom) { if (propagate_down[1]) { LOG(FATAL) << this->type() - << " Layer cannot backpropagate to label inputs."; + << " Layer cannot backpropagate to label inputs."; } if (propagate_down[0]) { // First, compute the diff @@ -70,11 +71,35 @@ void SigmoidCrossEntropyLossLayer::Backward_cpu( } } -#ifdef CPU_ONLY +#ifndef CPU_ONLY +template +void SigmoidCrossEntropyLossLayer::Backward_gpu( + const vector*>& top, const vector& propagate_down, + const vector*>& bottom) { + if (propagate_down[1]) { + LOG(FATAL) << this->type() + << " Layer cannot backpropagate to label inputs."; + } + if (propagate_down[0]) { + // First, compute the diff + const int count = bottom[0]->count(); + const int num = bottom[0]->num(); + const Dtype* sigmoid_output_data = 
sigmoid_output_->gpu_data(); + const Dtype* target = bottom[1]->gpu_data(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + caffe_gpu_copy(count, sigmoid_output_data, bottom_diff); + caffe_gpu_axpy(count, Dtype(-1), target, bottom_diff); + // Scale down gradient + const Dtype loss_weight = top[0]->cpu_diff()[0]; + caffe_gpu_scal(count, loss_weight / num, bottom_diff); + } +} + +#else STUB_GPU_BACKWARD(SigmoidCrossEntropyLossLayer, Backward); #endif -INSTANTIATE_CLASS(SigmoidCrossEntropyLossLayer); -REGISTER_LAYER_CLASS(SigmoidCrossEntropyLoss); +INSTANTIATE_CLASS (SigmoidCrossEntropyLossLayer); +REGISTER_LAYER_CLASS (SigmoidCrossEntropyLoss); } // namespace caffe diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu deleted file mode 100644 index 547fa80c..00000000 --- a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu +++ /dev/null @@ -1,37 +0,0 @@ -#include -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void SigmoidCrossEntropyLossLayer::Backward_gpu( - const vector*>& top, const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[1]) { - LOG(FATAL) << this->type() - << " Layer cannot backpropagate to label inputs."; - } - if (propagate_down[0]) { - // First, compute the diff - const int count = bottom[0]->count(); - const int num = bottom[0]->num(); - const Dtype* sigmoid_output_data = sigmoid_output_->gpu_data(); - const Dtype* target = bottom[1]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - caffe_copy(count, sigmoid_output_data, bottom_diff); - caffe_gpu_axpy(count, Dtype(-1), target, bottom_diff); - // Scale down gradient - const Dtype loss_weight = top[0]->cpu_diff()[0]; - caffe_gpu_scal(count, loss_weight / num, bottom_diff); - } -} - -INSTANTIATE_LAYER_GPU_BACKWARD(SigmoidCrossEntropyLossLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/sigmoid_layer.cpp b/src/caffe/layers/sigmoid_layer.cpp index 48c38490..b820e8ff 100644 --- a/src/caffe/layers/sigmoid_layer.cpp +++ b/src/caffe/layers/sigmoid_layer.cpp @@ -4,6 +4,7 @@ #include "caffe/layer.hpp" #include "caffe/vision_layers.hpp" +#include "caffe/util/ocl_wrapper.hpp" namespace caffe { @@ -25,8 +26,7 @@ void SigmoidLayer::Forward_cpu(const vector*>& bottom, template void SigmoidLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { if (propagate_down[0]) { const Dtype* top_data = top[0]->cpu_data(); const Dtype* top_diff = top[0]->cpu_diff(); @@ -39,11 +39,37 @@ void SigmoidLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY +#ifndef CPU_ONLY +// begin: code modified for OpenCL port + +template +void SigmoidLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + const int count = bottom[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + SigmoidForward(count, bottom_data, top_data); +} + +template +void SigmoidLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[0]) { + const Dtype* top_data = top[0]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const int count = bottom[0]->count(); + // 
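The SigmoidCrossEntropyLoss Backward_gpu added above reduces to sigmoid(x) - target, scaled by loss_weight / num, while the CPU forward pass uses the overflow-safe rewrite of -[t log s + (1 - t) log(1 - s)] with s = sigmoid(x). A CPU sketch of both, with an illustrative name and the loss-weight scaling left out:

    #include <cmath>
    template <typename Dtype>
    Dtype sigmoid_xent_ref(int count, int num, const Dtype* x, const Dtype* t,
                           Dtype* diff) {
      Dtype loss = 0;
      for (int i = 0; i < count; ++i) {
        // Stable form of -[t*log(s) + (1-t)*log(1-s)] with s = sigmoid(x).
        loss -= x[i] * (t[i] - (x[i] >= 0))
              - std::log(1 + std::exp(x[i] - 2 * x[i] * (x[i] >= 0)));
        diff[i] = (1 / (1 + std::exp(-x[i])) - t[i]) / num;  // (sigmoid(x) - t) / num
      }
      return loss / num;
    }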
NOLINT_NEXT_LINE(whitespace/operators) + SigmoidBackward(count, top_diff, top_data, bottom_diff); + } +} +// end: code modified for OpenCL port + +#else STUB_GPU(SigmoidLayer); #endif -INSTANTIATE_CLASS(SigmoidLayer); - +INSTANTIATE_CLASS (SigmoidLayer); } // namespace caffe diff --git a/src/caffe/layers/sigmoid_layer.cu b/src/caffe/layers/sigmoid_layer.cu deleted file mode 100644 index e1af0657..00000000 --- a/src/caffe/layers/sigmoid_layer.cu +++ /dev/null @@ -1,62 +0,0 @@ -#include -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -__global__ void SigmoidForward(const int n, const Dtype* in, Dtype* out) { - CUDA_KERNEL_LOOP(index, n) { - out[index] = 1. / (1. + exp(-in[index])); - } -} - -template -void SigmoidLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - SigmoidForward<<>>( - count, bottom_data, top_data); - CUDA_POST_KERNEL_CHECK; - // << " count: " << count << " bottom_data: " - // << (unsigned long)bottom_data - // << " top_data: " << (unsigned long)top_data - // << " blocks: " << CAFFE_GET_BLOCKS(count) - // << " threads: " << CAFFE_CUDA_NUM_THREADS; -} - -template -__global__ void SigmoidBackward(const int n, const Dtype* in_diff, - const Dtype* out_data, Dtype* out_diff) { - CUDA_KERNEL_LOOP(index, n) { - const Dtype sigmoid_x = out_data[index]; - out_diff[index] = in_diff[index] * sigmoid_x * (1 - sigmoid_x); - } -} - -template -void SigmoidLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - SigmoidBackward<<>>( - count, top_diff, top_data, bottom_diff); - CUDA_POST_KERNEL_CHECK; - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(SigmoidLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/silence_layer.cpp b/src/caffe/layers/silence_layer.cpp index 4abf9eff..4436584b 100644 --- a/src/caffe/layers/silence_layer.cpp +++ b/src/caffe/layers/silence_layer.cpp @@ -8,20 +8,39 @@ namespace caffe { template void SilenceLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { for (int i = 0; i < bottom.size(); ++i) { if (propagate_down[i]) { - caffe_set(bottom[i]->count(), Dtype(0), - bottom[i]->mutable_cpu_data()); + caffe_set(bottom[i]->count(), Dtype(0), bottom[i]->mutable_cpu_data()); } } } -#ifdef CPU_ONLY +#ifndef CPU_ONLY +// begin: code modified for OpenCL port +template +void SilenceLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + // Do nothing. 
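SigmoidForward and SigmoidBackward above come from the newly included caffe/util/ocl_wrapper.hpp; they must match s(x) = 1 / (1 + e^-x) on the forward pass and ds/dx = s(x)(1 - s(x)) on the backward pass, which reuses the already-computed top data. Reference sketch (names illustrative):

    #include <cmath>
    template <typename Dtype>
    void sigmoid_ref(int n, const Dtype* in, Dtype* out) {
      for (int i = 0; i < n; ++i) out[i] = 1 / (1 + std::exp(-in[i]));
    }

    template <typename Dtype>
    void sigmoid_backward_ref(int n, const Dtype* top_diff, const Dtype* top_data,
                              Dtype* bottom_diff) {
      for (int i = 0; i < n; ++i) {
        const Dtype s = top_data[i];            // s = sigmoid(x) from the forward pass
        bottom_diff[i] = top_diff[i] * s * (1 - s);
      }
    }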
+} + +template +void SilenceLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + for (int i = 0; i < bottom.size(); ++i) { + if (propagate_down[i]) { + caffe_gpu_set(bottom[i]->count(), Dtype(0), + bottom[i]->mutable_gpu_data()); + } + } +} +// end: code modified for OpenCL port + +#else STUB_GPU(SilenceLayer); #endif -INSTANTIATE_CLASS(SilenceLayer); -REGISTER_LAYER_CLASS(Silence); +INSTANTIATE_CLASS (SilenceLayer); +REGISTER_LAYER_CLASS (Silence); } // namespace caffe diff --git a/src/caffe/layers/silence_layer.cu b/src/caffe/layers/silence_layer.cu deleted file mode 100644 index 8d044ee7..00000000 --- a/src/caffe/layers/silence_layer.cu +++ /dev/null @@ -1,28 +0,0 @@ -#include - -#include "caffe/common_layers.hpp" -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" - -namespace caffe { - -template -void SilenceLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - // Do nothing. -} - -template -void SilenceLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - for (int i = 0; i < bottom.size(); ++i) { - if (propagate_down[i]) { - caffe_gpu_set(bottom[i]->count(), Dtype(0), - bottom[i]->mutable_gpu_data()); - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(SilenceLayer); - -} // namespace caffe diff --git a/src/caffe/layers/slice_layer.cpp b/src/caffe/layers/slice_layer.cpp index e4418c9c..de21e936 100644 --- a/src/caffe/layers/slice_layer.cpp +++ b/src/caffe/layers/slice_layer.cpp @@ -9,19 +9,18 @@ namespace caffe { template void SliceLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const SliceParameter& slice_param = this->layer_param_.slice_param(); CHECK(!(slice_param.has_axis() && slice_param.has_slice_dim())) << "Either axis or slice_dim should be specified; not both."; slice_point_.clear(); - std::copy(slice_param.slice_point().begin(), - slice_param.slice_point().end(), + std::copy(slice_param.slice_point().begin(), slice_param.slice_point().end(), std::back_inserter(slice_point_)); } template void SliceLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { const int num_axes = bottom[0]->num_axes(); const SliceParameter& slice_param = this->layer_param_.slice_param(); if (slice_param.has_slice_dim()) { @@ -57,9 +56,9 @@ void SliceLayer::Reshape(const vector*>& bottom, count += top[i]->count(); } } else { - CHECK_EQ(bottom_slice_axis % top.size(), 0) - << "Number of top blobs (" << top.size() << ") should evenly " - << "divide input slice axis (" << bottom_slice_axis << ")"; + CHECK_EQ(bottom_slice_axis % top.size(), 0) << "Number of top blobs (" + << top.size() << ") should evenly " << "divide input slice axis (" + << bottom_slice_axis << ")"; top_shape[slice_axis_] = bottom_slice_axis / top.size(); for (int i = 0; i < top.size(); ++i) { top[i]->Reshape(top_shape); @@ -71,7 +70,7 @@ void SliceLayer::Reshape(const vector*>& bottom, template void SliceLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { int offset_slice_axis = 0; const Dtype* bottom_data = bottom[0]->cpu_data(); const int bottom_slice_axis = bottom[0]->shape(slice_axis_); @@ -80,10 +79,10 @@ void SliceLayer::Forward_cpu(const vector*>& bottom, const int top_slice_axis = top[i]->shape(slice_axis_); for (int n = 0; n < num_slices_; ++n) { const int top_offset = n * top_slice_axis * slice_size_; - const int bottom_offset = - (n * bottom_slice_axis + offset_slice_axis) * slice_size_; - 
caffe_copy(top_slice_axis * slice_size_, - bottom_data + bottom_offset, top_data + top_offset); + const int bottom_offset = (n * bottom_slice_axis + offset_slice_axis) + * slice_size_; + caffe_copy(top_slice_axis * slice_size_, bottom_data + bottom_offset, + top_data + top_offset); } offset_slice_axis += top_slice_axis; } @@ -91,8 +90,10 @@ void SliceLayer::Forward_cpu(const vector*>& bottom, template void SliceLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { return; } + const vector& propagate_down, const vector*>& bottom) { + if (!propagate_down[0]) { + return; + } int offset_slice_axis = 0; Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); const int bottom_slice_axis = bottom[0]->shape(slice_axis_); @@ -101,20 +102,63 @@ void SliceLayer::Backward_cpu(const vector*>& top, const int top_slice_axis = top[i]->shape(slice_axis_); for (int n = 0; n < num_slices_; ++n) { const int top_offset = n * top_slice_axis * slice_size_; - const int bottom_offset = - (n * bottom_slice_axis + offset_slice_axis) * slice_size_; - caffe_copy(top_slice_axis * slice_size_, - top_diff + top_offset, bottom_diff + bottom_offset); + const int bottom_offset = (n * bottom_slice_axis + offset_slice_axis) + * slice_size_; + caffe_copy(top_slice_axis * slice_size_, top_diff + top_offset, + bottom_diff + bottom_offset); } offset_slice_axis += top_slice_axis; } } -#ifdef CPU_ONLY +#ifndef CPU_ONLY +// begin: code modified for OpenCL port +template +void SliceLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + if (top.size() == 1) { return; } + int offset_slice_axis = 0; + const Dtype* bottom_data = bottom[0]->gpu_data(); + const int bottom_slice_axis = bottom[0]->shape(slice_axis_); + const bool kForward = true; + for (int i = 0; i < top.size(); ++i) { + Dtype* top_data = top[i]->mutable_gpu_data(); + const int top_slice_axis = top[i]->shape(slice_axis_); + const int top_slice_size = top_slice_axis * slice_size_; + const int nthreads = top_slice_size * num_slices_; + Slice // NOLINT_NEXT_LINE(whitespace/operators) + (nthreads, bottom_data, kForward, num_slices_, slice_size_, + bottom_slice_axis, top_slice_axis, offset_slice_axis, top_data); + offset_slice_axis += top_slice_axis; + } +} + +template +void SliceLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + if (!propagate_down[0] || top.size() == 1) { return; } + int offset_slice_axis = 0; + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const int bottom_slice_axis = bottom[0]->shape(slice_axis_); + const bool kForward = false; + for (int i = 0; i < top.size(); ++i) { + const Dtype* top_diff = top[i]->gpu_diff(); + const int top_slice_axis = top[i]->shape(slice_axis_); + const int top_slice_size = top_slice_axis * slice_size_; + const int nthreads = top_slice_size * num_slices_; + Slice // NOLINT_NEXT_LINE(whitespace/operators) + (nthreads, top_diff, kForward, num_slices_, slice_size_, + bottom_slice_axis, top_slice_axis, offset_slice_axis, bottom_diff); + offset_slice_axis += top_slice_axis; + } +} +// end: code modified for OpenCL port + +#else STUB_GPU(SliceLayer); #endif -INSTANTIATE_CLASS(SliceLayer); -REGISTER_LAYER_CLASS(Slice); +INSTANTIATE_CLASS (SliceLayer); +REGISTER_LAYER_CLASS (Slice); } // namespace caffe diff --git a/src/caffe/layers/slice_layer.cu b/src/caffe/layers/slice_layer.cu deleted file mode 100644 index 796841d3..00000000 --- a/src/caffe/layers/slice_layer.cu +++ /dev/null @@ -1,71 +0,0 @@ 
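Both GPU paths of the slice layer above call a host-side Slice wrapper that keeps the index arithmetic of the deleted CUDA kernel, and the port adds an early return when top.size() == 1, which is a pure pass-through. The mapping from a flat top index to the corresponding bottom index, as a CPU sketch (name illustrative):

    template <typename Dtype>
    void slice_ref(int nthreads, const Dtype* in, bool forward, int num_slices,
                   int slice_size, int bottom_slice_axis, int top_slice_axis,
                   int offset_slice_axis, Dtype* out) {
      for (int index = 0; index < nthreads; ++index) {
        const int total_slice_size = slice_size * top_slice_axis;
        const int slice_num = index / total_slice_size;
        const int slice_index = index % total_slice_size;
        const int bottom_index = slice_index +
            (slice_num * bottom_slice_axis + offset_slice_axis) * slice_size;
        if (forward) out[index] = in[bottom_index];   // bottom data -> top data
        else         out[bottom_index] = in[index];   // top diff -> bottom diff
      }
    }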
-#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -__global__ void Slice(const int nthreads, const Dtype* in_data, - const bool forward, const int num_slices, const int slice_size, - const int bottom_slice_axis, const int top_slice_axis, - const int offset_slice_axis, Dtype* out_data) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int total_slice_size = slice_size * top_slice_axis; - const int slice_num = index / total_slice_size; - const int slice_index = index % total_slice_size; - const int bottom_index = slice_index + - (slice_num * bottom_slice_axis + offset_slice_axis) * slice_size; - if (forward) { - out_data[index] = in_data[bottom_index]; - } else { - out_data[bottom_index] = in_data[index]; - } - } -} - -template -void SliceLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - int offset_slice_axis = 0; - const Dtype* bottom_data = bottom[0]->gpu_data(); - const int bottom_slice_axis = bottom[0]->shape(slice_axis_); - const bool kForward = true; - for (int i = 0; i < top.size(); ++i) { - Dtype* top_data = top[i]->mutable_gpu_data(); - const int top_slice_axis = top[i]->shape(slice_axis_); - const int top_slice_size = top_slice_axis * slice_size_; - const int nthreads = top_slice_size * num_slices_; - Slice // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - nthreads, bottom_data, kForward, num_slices_, slice_size_, - bottom_slice_axis, top_slice_axis, offset_slice_axis, top_data); - offset_slice_axis += top_slice_axis; - } -} - -template -void SliceLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { return; } - int offset_slice_axis = 0; - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int bottom_slice_axis = bottom[0]->shape(slice_axis_); - const bool kForward = false; - for (int i = 0; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->gpu_diff(); - const int top_slice_axis = top[i]->shape(slice_axis_); - const int top_slice_size = top_slice_axis * slice_size_; - const int nthreads = top_slice_size * num_slices_; - Slice // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - nthreads, top_diff, kForward, num_slices_, slice_size_, - bottom_slice_axis, top_slice_axis, offset_slice_axis, bottom_diff); - offset_slice_axis += top_slice_axis; - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(SliceLayer); - -} // namespace caffe diff --git a/src/caffe/layers/softmax_layer.cpp b/src/caffe/layers/softmax_layer.cpp index 04712c9e..1269b058 100644 --- a/src/caffe/layers/softmax_layer.cpp +++ b/src/caffe/layers/softmax_layer.cpp @@ -9,9 +9,9 @@ namespace caffe { template void SoftmaxLayer::Reshape(const vector*>& bottom, - const vector*>& top) { - softmax_axis_ = - bottom[0]->CanonicalAxisIndex(this->layer_param_.softmax_param().axis()); + const vector*>& top) { + softmax_axis_ = bottom[0]->CanonicalAxisIndex( + this->layer_param_.softmax_param().axis()); top[0]->ReshapeLike(*bottom[0]); vector mult_dims(1, bottom[0]->shape(softmax_axis_)); sum_multiplier_.Reshape(mult_dims); @@ -24,6 +24,10 @@ void SoftmaxLayer::Reshape(const vector*>& bottom, scale_.Reshape(scale_dims); } +template +SoftmaxLayer::~SoftmaxLayer() { +} + template void SoftmaxLayer::Forward_cpu(const vector*>& bottom, const vector*>& top) { @@ -45,13 +49,13 @@ void SoftmaxLayer::Forward_cpu(const vector*>& bottom, } } // subtraction - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, channels, inner_num_, - 1, -1., 
sum_multiplier_.cpu_data(), scale_data, 1., top_data); + caffe_cpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, channels, inner_num_, 1, -1., sum_multiplier_.cpu_data(), scale_data, 1., top_data); // exponentiation - caffe_exp(dim, top_data, top_data); + caffe_exp < Dtype > (dim, top_data, top_data); // sum after exp - caffe_cpu_gemv(CblasTrans, channels, inner_num_, 1., - top_data, sum_multiplier_.cpu_data(), 0., scale_data); + caffe_cpu_gemv < Dtype + > (CblasTrans, channels, inner_num_, 1., top_data, sum_multiplier_.cpu_data(), 0., scale_data); // division for (int j = 0; j < channels; j++) { caffe_div(inner_num_, top_data, scale_data, top_data); @@ -62,8 +66,7 @@ void SoftmaxLayer::Forward_cpu(const vector*>& bottom, template void SoftmaxLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { const Dtype* top_diff = top[0]->cpu_diff(); const Dtype* top_data = top[0]->cpu_data(); Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); @@ -74,23 +77,82 @@ void SoftmaxLayer::Backward_cpu(const vector*>& top, for (int i = 0; i < outer_num_; ++i) { // compute dot(top_diff, top_data) and subtract them from the bottom diff for (int k = 0; k < inner_num_; ++k) { - scale_data[k] = caffe_cpu_strided_dot(channels, - bottom_diff + i * dim + k, inner_num_, - top_data + i * dim + k, inner_num_); + scale_data[k] = caffe_cpu_strided_dot < Dtype + > (channels, bottom_diff + i * dim + k, inner_num_, top_data + i * dim + + k, inner_num_); } // subtraction - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, channels, inner_num_, 1, - -1., sum_multiplier_.cpu_data(), scale_data, 1., bottom_diff + i * dim); + caffe_cpu_gemm < Dtype + > (CblasNoTrans, CblasNoTrans, channels, inner_num_, 1, -1., sum_multiplier_.cpu_data(), scale_data, 1., bottom_diff + + i * dim); } // elementwise multiplication caffe_mul(top[0]->count(), bottom_diff, top_data, bottom_diff); } +#ifndef CPU_ONLY +// begin: code modified for OpenCL port +template +void SoftmaxLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + Dtype* scale_data = scale_.mutable_gpu_data(); + int count = bottom[0]->count(); + int channels = top[0]->shape(softmax_axis_); + + caffe_gpu_copy(count, bottom_data, top_data); + // We need to subtract the max to avoid numerical issues, compute the exp, + // and then normalize. 
+ // compute max + // NOLINT_NEXT_LINE(whitespace/operators) + + kernel_channel_max < Dtype + > (outer_num_, channels, inner_num_, top_data, scale_data); + // subtract + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_channel_subtract < Dtype + > (count, outer_num_, channels, inner_num_, scale_data, top_data); + // exponentiate + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_exp < Dtype > (count, top_data, top_data); + // sum after exp + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_channel_sum < Dtype + > (outer_num_, channels, inner_num_, top_data, scale_data); + // divide + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_channel_div < Dtype + > (count, outer_num_, channels, inner_num_, scale_data, top_data); +} + +template +void SoftmaxLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + const Dtype* top_diff = top[0]->gpu_diff(); + const Dtype* top_data = top[0]->gpu_data(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + Dtype* scale_data = scale_.mutable_gpu_data(); + int count = top[0]->count(); + int channels = top[0]->shape(softmax_axis_); + caffe_gpu_copy(count, top_diff, bottom_diff); + // Compute inner1d(top_diff, top_data) and subtract them from the bottom diff. + // NOLINT_NEXT_LINE(whitespace/operators) + + kernel_channel_dot < Dtype + > (outer_num_, channels, inner_num_, top_diff, top_data, scale_data); + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_channel_subtract < Dtype + > (count, outer_num_, channels, inner_num_, scale_data, bottom_diff); + // elementwise multiplication + caffe_gpu_mul < Dtype > (top[0]->count(), bottom_diff, top_data, bottom_diff); -#ifdef CPU_ONLY +} +// end: code modified for OpenCL port +#else STUB_GPU(SoftmaxLayer); #endif -INSTANTIATE_CLASS(SoftmaxLayer); +INSTANTIATE_CLASS (SoftmaxLayer); } // namespace caffe diff --git a/src/caffe/layers/softmax_layer.cu b/src/caffe/layers/softmax_layer.cu deleted file mode 100644 index 1f9c3a41..00000000 --- a/src/caffe/layers/softmax_layer.cu +++ /dev/null @@ -1,149 +0,0 @@ -#include -#include -#include - -#include "thrust/device_vector.h" - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -__global__ void kernel_channel_max(const int num, const int channels, - const int spatial_dim, const Dtype* data, Dtype* out) { - CUDA_KERNEL_LOOP(index, num * spatial_dim) { - int n = index / spatial_dim; - int s = index % spatial_dim; - Dtype maxval = -FLT_MAX; - for (int c = 0; c < channels; ++c) { - maxval = max(data[(n * channels + c) * spatial_dim + s], maxval); - } - out[index] = maxval; - } -} - -template -__global__ void kernel_channel_subtract(const int count, - const int num, const int channels, - const int spatial_dim, const Dtype* channel_max, Dtype* data) { - CUDA_KERNEL_LOOP(index, count) { - int n = index / channels / spatial_dim; - int s = index % spatial_dim; - data[index] -= channel_max[n * spatial_dim + s]; - } -} - -template -__global__ void kernel_exp(const int count, const Dtype* data, Dtype* out) { - CUDA_KERNEL_LOOP(index, count) { - out[index] = exp(data[index]); - } -} - -template -__global__ void kernel_channel_sum(const int num, const int channels, - const int spatial_dim, const Dtype* data, Dtype* channel_sum) { - CUDA_KERNEL_LOOP(index, num * spatial_dim) { - int n = index / spatial_dim; - int s = index % spatial_dim; - Dtype sum = 0; - for (int c = 0; c < channels; ++c) { - sum += data[(n * channels + c) * spatial_dim + s]; - 
} - channel_sum[index] = sum; - } -} - -template -__global__ void kernel_channel_div(const int count, - const int num, const int channels, - const int spatial_dim, const Dtype* channel_sum, Dtype* data) { - CUDA_KERNEL_LOOP(index, count) { - int n = index / channels / spatial_dim; - int s = index % spatial_dim; - data[index] /= channel_sum[n * spatial_dim + s]; - } -} - -template -__global__ void kernel_channel_dot(const int num, const int channels, - const int spatial_dim, const Dtype* data_1, const Dtype* data_2, - Dtype* channel_dot) { - CUDA_KERNEL_LOOP(index, num * spatial_dim) { - int n = index / spatial_dim; - int s = index % spatial_dim; - Dtype dot = 0; - for (int c = 0; c < channels; ++c) { - dot += (data_1[(n * channels + c) * spatial_dim + s] - * data_2[(n * channels + c) * spatial_dim + s]); - } - channel_dot[index] = dot; - } -} - -template -void SoftmaxLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - Dtype* scale_data = scale_.mutable_gpu_data(); - int count = bottom[0]->count(); - int channels = top[0]->shape(softmax_axis_); - caffe_copy(count, bottom_data, top_data); - // We need to subtract the max to avoid numerical issues, compute the exp, - // and then normalize. - // compute max - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_max<<>>(outer_num_, channels, inner_num_, top_data, - scale_data); - // subtract - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_subtract<<>>(count, outer_num_, channels, inner_num_, - scale_data, top_data); - // exponentiate - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_exp<<>>( - count, top_data, top_data); - // sum after exp - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_sum<<>>(outer_num_, channels, inner_num_, top_data, - scale_data); - // divide - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_div<<>>(count, outer_num_, channels, inner_num_, - scale_data, top_data); -} - -template -void SoftmaxLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - const Dtype* top_diff = top[0]->gpu_diff(); - const Dtype* top_data = top[0]->gpu_data(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - Dtype* scale_data = scale_.mutable_gpu_data(); - int count = top[0]->count(); - int channels = top[0]->shape(softmax_axis_); - caffe_copy(count, top_diff, bottom_diff); - // Compute inner1d(top_diff, top_data) and subtract them from the bottom diff. 
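The kernel_channel_max / kernel_channel_subtract / kernel_exp / kernel_channel_sum / kernel_channel_div wrappers used in the new SoftmaxLayer::Forward_gpu reproduce, per (outer, spatial) position, the usual numerically stable softmax over the channel axis: subtract the channel max, exponentiate, then divide by the channel sum. In-place CPU sketch (name illustrative):

    #include <algorithm>
    #include <cmath>
    template <typename Dtype>
    void channel_softmax_ref(int num, int channels, int spatial_dim, Dtype* data) {
      for (int n = 0; n < num; ++n) {
        for (int s = 0; s < spatial_dim; ++s) {
          Dtype maxval = data[n * channels * spatial_dim + s];
          for (int c = 1; c < channels; ++c)
            maxval = std::max(maxval, data[(n * channels + c) * spatial_dim + s]);
          Dtype sum = 0;
          for (int c = 0; c < channels; ++c) {
            Dtype& v = data[(n * channels + c) * spatial_dim + s];
            v = std::exp(v - maxval);
            sum += v;
          }
          for (int c = 0; c < channels; ++c)
            data[(n * channels + c) * spatial_dim + s] /= sum;
        }
      }
    }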
- // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_dot<<>>(outer_num_, channels, inner_num_, - top_diff, top_data, scale_data); - // NOLINT_NEXT_LINE(whitespace/operators) - kernel_channel_subtract<<>>(count, outer_num_, channels, inner_num_, - scale_data, bottom_diff); - // elementwise multiplication - caffe_gpu_mul(top[0]->count(), bottom_diff, top_data, bottom_diff); -} - -INSTANTIATE_LAYER_GPU_FUNCS(SoftmaxLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/softmax_loss_layer.cpp b/src/caffe/layers/softmax_loss_layer.cpp index ba312f67..ef03ec7e 100644 --- a/src/caffe/layers/softmax_loss_layer.cpp +++ b/src/caffe/layers/softmax_loss_layer.cpp @@ -10,20 +10,19 @@ namespace caffe { template -void SoftmaxWithLossLayer::LayerSetUp( - const vector*>& bottom, const vector*>& top) { - LossLayer::LayerSetUp(bottom, top); +void SoftmaxWithLossLayer::LayerSetUp(const vector*>& bottom, + const vector*>& top) { + LossLayer < Dtype > ::LayerSetUp(bottom, top); LayerParameter softmax_param(this->layer_param_); softmax_param.set_type("Softmax"); - softmax_layer_ = LayerRegistry::CreateLayer(softmax_param); + softmax_layer_ = LayerRegistry < Dtype > ::CreateLayer(softmax_param); softmax_bottom_vec_.clear(); softmax_bottom_vec_.push_back(bottom[0]); softmax_top_vec_.clear(); softmax_top_vec_.push_back(&prob_); softmax_layer_->SetUp(softmax_bottom_vec_, softmax_top_vec_); - has_ignore_label_ = - this->layer_param_.loss_param().has_ignore_label(); + has_ignore_label_ = this->layer_param_.loss_param().has_ignore_label(); if (has_ignore_label_) { ignore_label_ = this->layer_param_.loss_param().ignore_label(); } @@ -31,12 +30,16 @@ void SoftmaxWithLossLayer::LayerSetUp( } template -void SoftmaxWithLossLayer::Reshape( - const vector*>& bottom, const vector*>& top) { - LossLayer::Reshape(bottom, top); +SoftmaxWithLossLayer::~SoftmaxWithLossLayer() { +} + +template +void SoftmaxWithLossLayer::Reshape(const vector*>& bottom, + const vector*>& top) { + LossLayer < Dtype > ::Reshape(bottom, top); softmax_layer_->Reshape(softmax_bottom_vec_, softmax_top_vec_); - softmax_axis_ = - bottom[0]->CanonicalAxisIndex(this->layer_param_.softmax_param().axis()); + softmax_axis_ = bottom[0]->CanonicalAxisIndex( + this->layer_param_.softmax_param().axis()); outer_num_ = bottom[0]->count(0, softmax_axis_); inner_num_ = bottom[0]->count(softmax_axis_ + 1); CHECK_EQ(outer_num_ * inner_num_, bottom[1]->count()) @@ -68,8 +71,9 @@ void SoftmaxWithLossLayer::Forward_cpu( } DCHECK_GE(label_value, 0); DCHECK_LT(label_value, prob_.shape(softmax_axis_)); - loss -= log(std::max(prob_data[i * dim + label_value * inner_num_ + j], - Dtype(FLT_MIN))); + loss -= log( + std::max(prob_data[i * dim + label_value * inner_num_ + j], + Dtype(FLT_MIN))); ++count; } } @@ -88,7 +92,7 @@ void SoftmaxWithLossLayer::Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { if (propagate_down[1]) { LOG(FATAL) << this->type() - << " Layer cannot backpropagate to label inputs."; + << " Layer cannot backpropagate to label inputs."; } if (propagate_down[0]) { Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); @@ -120,11 +124,79 @@ void SoftmaxWithLossLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY +// begin: code modified for OpenCL port +#ifndef CPU_ONLY +template +void SoftmaxWithLossLayer::Forward_gpu( + const vector*>& bottom, const vector*>& top) { + softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_); + const Dtype* prob_data = prob_.gpu_data(); + const Dtype* label = 
bottom[1]->gpu_data(); + const int dim = prob_.count() / outer_num_; + const int nthreads = outer_num_ * inner_num_; + // Since this memory is not used for anything until it is overwritten + // on the backward pass, we use it here to avoid having to allocate new GPU + // memory to accumulate intermediate results in the kernel. + Dtype* loss_data = bottom[0]->mutable_gpu_diff(); + // Similarly, this memory is never used elsewhere, and thus we can use it + // to avoid having to allocate additional GPU memory. + Dtype* counts = prob_.mutable_gpu_diff(); + // NOLINT_NEXT_LINE(whitespace/operators) + SoftmaxLossForwardGPU < Dtype + > (nthreads, prob_data, label, loss_data, outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts); + Dtype loss; + caffe_gpu_asum(nthreads, loss_data, &loss); + if (normalize_) { + Dtype count; + caffe_gpu_asum(nthreads, counts, &count); + loss /= count; + } else { + loss /= outer_num_; + } + top[0]->mutable_cpu_data()[0] = loss; + if (top.size() == 2) { + top[1]->ShareData(prob_); + } +} + +template +void SoftmaxWithLossLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[1]) { + LOG(FATAL) << this->type() + << " Layer cannot backpropagate to label inputs."; + } + if (propagate_down[0]) { + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const Dtype* prob_data = prob_.gpu_data(); + const Dtype* top_data = top[0]->gpu_data(); + caffe_gpu_memcpy(prob_.count() * sizeof(Dtype), prob_data, bottom_diff); + //caffe_gpu_copy(prob_.count(), prob_data, bottom_diff); + const Dtype* label = bottom[1]->gpu_data(); + const int dim = prob_.count() / outer_num_; + const int nthreads = outer_num_ * inner_num_; + // Since this memory is never used for anything else, + // we use to to avoid allocating new GPU memory. 
+ Dtype* counts = prob_.mutable_gpu_diff(); + // NOLINT_NEXT_LINE(whitespace/operators) + SoftmaxLossBackwardGPU < Dtype + > (nthreads, top_data, label, bottom_diff, outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts); + const Dtype loss_weight = top[0]->cpu_diff()[0]; + if (normalize_) { + Dtype count; + caffe_gpu_asum(nthreads, counts, &count); + caffe_gpu_scal(prob_.count(), loss_weight / count, bottom_diff); + } else { + caffe_gpu_scal(prob_.count(), loss_weight / outer_num_, bottom_diff); + } + } +} +// end: code modified for OpenCL port +#else STUB_GPU(SoftmaxWithLossLayer); #endif -INSTANTIATE_CLASS(SoftmaxWithLossLayer); -REGISTER_LAYER_CLASS(SoftmaxWithLoss); +INSTANTIATE_CLASS (SoftmaxWithLossLayer); +REGISTER_LAYER_CLASS (SoftmaxWithLoss); } // namespace caffe diff --git a/src/caffe/layers/softmax_loss_layer.cu b/src/caffe/layers/softmax_loss_layer.cu deleted file mode 100644 index 7e0f3da4..00000000 --- a/src/caffe/layers/softmax_loss_layer.cu +++ /dev/null @@ -1,125 +0,0 @@ -#include -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -__global__ void SoftmaxLossForwardGPU(const int nthreads, - const Dtype* prob_data, const Dtype* label, Dtype* loss, - const int num, const int dim, const int spatial_dim, - const bool has_ignore_label_, const int ignore_label_, - Dtype* counts) { - CUDA_KERNEL_LOOP(index, nthreads) { - const int n = index / spatial_dim; - const int s = index % spatial_dim; - const int label_value = static_cast(label[n * spatial_dim + s]); - if (has_ignore_label_ && label_value == ignore_label_) { - loss[index] = 0; - counts[index] = 0; - } else { - loss[index] = -log(max(prob_data[n * dim + label_value * spatial_dim + s], - Dtype(FLT_MIN))); - counts[index] = 1; - } - } -} - -template -void SoftmaxWithLossLayer::Forward_gpu( - const vector*>& bottom, const vector*>& top) { - softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_); - const Dtype* prob_data = prob_.gpu_data(); - const Dtype* label = bottom[1]->gpu_data(); - const int dim = prob_.count() / outer_num_; - const int nthreads = outer_num_ * inner_num_; - // Since this memory is not used for anything until it is overwritten - // on the backward pass, we use it here to avoid having to allocate new GPU - // memory to accumulate intermediate results in the kernel. - Dtype* loss_data = bottom[0]->mutable_gpu_diff(); - // Similarly, this memory is never used elsewhere, and thus we can use it - // to avoid having to allocate additional GPU memory. 
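The SoftmaxWithLoss GPU path above mirrors the CPU one: the forward pass accumulates -log(max(prob[label], FLT_MIN)) per position, and the backward pass starts from the probabilities and subtracts 1 at the ground-truth channel before scaling by the loss weight over the normalizer. A CPU sketch of the normalize_ == true case, with ignore_label_ handling omitted and an illustrative name:

    #include <algorithm>
    #include <cfloat>
    #include <cmath>
    // prob: outer_num x channels x inner_num (already softmaxed);
    // label: outer_num x inner_num; diff receives the unweighted gradient.
    template <typename Dtype>
    Dtype softmax_loss_ref(int outer_num, int channels, int inner_num,
                           const Dtype* prob, const Dtype* label, Dtype* diff) {
      const int dim = channels * inner_num;
      Dtype loss = 0;
      int count = 0;
      for (int i = 0; i < outer_num * dim; ++i) diff[i] = prob[i];
      for (int n = 0; n < outer_num; ++n) {
        for (int s = 0; s < inner_num; ++s) {
          const int lv = static_cast<int>(label[n * inner_num + s]);
          loss -= std::log(std::max(prob[n * dim + lv * inner_num + s],
                                    Dtype(FLT_MIN)));
          diff[n * dim + lv * inner_num + s] -= 1;
          ++count;
        }
      }
      for (int i = 0; i < outer_num * dim; ++i) diff[i] /= count;
      return loss / count;
    }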
- Dtype* counts = prob_.mutable_gpu_diff(); - // NOLINT_NEXT_LINE(whitespace/operators) - SoftmaxLossForwardGPU<<>>(nthreads, prob_data, label, loss_data, - outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts); - Dtype loss; - caffe_gpu_asum(nthreads, loss_data, &loss); - if (normalize_) { - Dtype count; - caffe_gpu_asum(nthreads, counts, &count); - loss /= count; - } else { - loss /= outer_num_; - } - top[0]->mutable_cpu_data()[0] = loss; - if (top.size() == 2) { - top[1]->ShareData(prob_); - } -} - -template -__global__ void SoftmaxLossBackwardGPU(const int nthreads, const Dtype* top, - const Dtype* label, Dtype* bottom_diff, const int num, const int dim, - const int spatial_dim, const bool has_ignore_label_, - const int ignore_label_, Dtype* counts) { - const int channels = dim / spatial_dim; - - CUDA_KERNEL_LOOP(index, nthreads) { - const int n = index / spatial_dim; - const int s = index % spatial_dim; - const int label_value = static_cast(label[n * spatial_dim + s]); - - if (has_ignore_label_ && label_value == ignore_label_) { - for (int c = 0; c < channels; ++c) { - bottom_diff[n * dim + c * spatial_dim + s] = 0; - } - counts[index] = 0; - } else { - bottom_diff[n * dim + label_value * spatial_dim + s] -= 1; - counts[index] = 1; - } - } -} - -template -void SoftmaxWithLossLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (propagate_down[1]) { - LOG(FATAL) << this->type() - << " Layer cannot backpropagate to label inputs."; - } - if (propagate_down[0]) { - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const Dtype* prob_data = prob_.gpu_data(); - const Dtype* top_data = top[0]->gpu_data(); - caffe_gpu_memcpy(prob_.count() * sizeof(Dtype), prob_data, bottom_diff); - const Dtype* label = bottom[1]->gpu_data(); - const int dim = prob_.count() / outer_num_; - const int nthreads = outer_num_ * inner_num_; - // Since this memory is never used for anything else, - // we use to to avoid allocating new GPU memory. - Dtype* counts = prob_.mutable_gpu_diff(); - // NOLINT_NEXT_LINE(whitespace/operators) - SoftmaxLossBackwardGPU<<>>(nthreads, top_data, label, bottom_diff, - outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts); - const Dtype loss_weight = top[0]->cpu_diff()[0]; - if (normalize_) { - Dtype count; - caffe_gpu_asum(nthreads, counts, &count); - caffe_gpu_scal(prob_.count(), loss_weight / count, bottom_diff); - } else { - caffe_gpu_scal(prob_.count(), loss_weight / outer_num_, bottom_diff); - } - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(SoftmaxWithLossLayer); - -} // namespace caffe diff --git a/src/caffe/layers/split_layer.cpp b/src/caffe/layers/split_layer.cpp index 272cb59c..e92f7bf2 100644 --- a/src/caffe/layers/split_layer.cpp +++ b/src/caffe/layers/split_layer.cpp @@ -8,7 +8,7 @@ namespace caffe { template void SplitLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { count_ = bottom[0]->count(); for (int i = 0; i < top.size(); ++i) { // Do not allow in-place computation in the SplitLayer. 
Instead, share data @@ -25,7 +25,7 @@ void SplitLayer::Reshape(const vector*>& bottom, template void SplitLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { for (int i = 0; i < top.size(); ++i) { top[i]->ShareData(*bottom[0]); } @@ -33,14 +33,16 @@ void SplitLayer::Forward_cpu(const vector*>& bottom, template void SplitLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { return; } + const vector& propagate_down, const vector*>& bottom) { + if (!propagate_down[0]) { + return; + } if (top.size() == 1) { caffe_copy(count_, top[0]->cpu_diff(), bottom[0]->mutable_cpu_diff()); return; } caffe_add(count_, top[0]->cpu_diff(), top[1]->cpu_diff(), - bottom[0]->mutable_cpu_diff()); + bottom[0]->mutable_cpu_diff()); // Add remaining top blob diffs. for (int i = 2; i < top.size(); ++i) { const Dtype* top_diff = top[i]->cpu_diff(); @@ -49,12 +51,41 @@ void SplitLayer::Backward_cpu(const vector*>& top, } } +#ifndef CPU_ONLY +template +void SplitLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + for (int i = 0; i < top.size(); ++i) { + top[i]->ShareData(*bottom[0]); + } +} -#ifdef CPU_ONLY +// begin: code modified for OpenCL port +template +void SplitLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + if (!propagate_down[0]) { + return; + } + if (top.size() == 1) { + caffe_gpu_copy(count_, top[0]->gpu_diff(), bottom[0]->mutable_gpu_diff()); + return; + } + caffe_gpu_add(count_, top[0]->gpu_diff(), top[1]->gpu_diff(), + bottom[0]->mutable_gpu_diff()); + // Add remaining top blob diffs. + for (int i = 2; i < top.size(); ++i) { + const Dtype* top_diff = top[i]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + caffe_gpu_axpy(count_, Dtype(1.), top_diff, bottom_diff); + } +} +// begin: code modified for OpenCL port +#else STUB_GPU(SplitLayer); #endif -INSTANTIATE_CLASS(SplitLayer); -REGISTER_LAYER_CLASS(Split); +INSTANTIATE_CLASS (SplitLayer); +REGISTER_LAYER_CLASS (Split); } // namespace caffe diff --git a/src/caffe/layers/split_layer.cu b/src/caffe/layers/split_layer.cu deleted file mode 100644 index a4f5df26..00000000 --- a/src/caffe/layers/split_layer.cu +++ /dev/null @@ -1,38 +0,0 @@ -#include - -#include "caffe/layer.hpp" -#include "caffe/util/math_functions.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -void SplitLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - for (int i = 0; i < top.size(); ++i) { - top[i]->ShareData(*bottom[0]); - } -} - -template -void SplitLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - if (!propagate_down[0]) { return; } - if (top.size() == 1) { - caffe_copy(count_, top[0]->gpu_diff(), bottom[0]->mutable_gpu_diff()); - return; - } - caffe_gpu_add(count_, top[0]->gpu_diff(), top[1]->gpu_diff(), - bottom[0]->mutable_gpu_diff()); - // Add remaining top blob diffs. 
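The split layer's backward pass, CPU and GPU alike, simply sums the diffs of every top blob into the single bottom diff; the forward pass shares data and copies nothing. CPU sketch (name illustrative):

    template <typename Dtype>
    void split_backward_ref(int count, int num_tops,
                            const Dtype* const* top_diffs, Dtype* bottom_diff) {
      for (int i = 0; i < count; ++i) {
        Dtype sum = 0;
        for (int t = 0; t < num_tops; ++t) sum += top_diffs[t][i];
        bottom_diff[i] = sum;
      }
    }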
- for (int i = 2; i < top.size(); ++i) { - const Dtype* top_diff = top[i]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - caffe_gpu_axpy(count_, Dtype(1.), top_diff, bottom_diff); - } -} - - -INSTANTIATE_LAYER_GPU_FUNCS(SplitLayer); - -} // namespace caffe diff --git a/src/caffe/layers/spp_layer.cpp b/src/caffe/layers/spp_layer.cpp index 795dd716..d552af61 100644 --- a/src/caffe/layers/spp_layer.cpp +++ b/src/caffe/layers/spp_layer.cpp @@ -15,7 +15,7 @@ using std::max; template LayerParameter SPPLayer::GetPoolingParam(const int pyramid_level, - const int bottom_h, const int bottom_w, const SPPParameter spp_param) { + const int bottom_h, const int bottom_w, const SPPParameter spp_param) { LayerParameter pooling_param; int num_bins = pow(2, pyramid_level); @@ -63,7 +63,7 @@ LayerParameter SPPLayer::GetPoolingParam(const int pyramid_level, template void SPPLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { SPPParameter spp_param = this->layer_param_.spp_param(); bottom_h_ = bottom[0]->height(); @@ -103,11 +103,12 @@ void SPPLayer::LayerSetUp(const vector*>& bottom, pooling_top_vecs_[i]->push_back(pooling_outputs_[i]); // pooling layer setup - LayerParameter pooling_param = GetPoolingParam( - i, bottom_h_, bottom_w_, spp_param); + LayerParameter pooling_param = GetPoolingParam(i, bottom_h_, bottom_w_, + spp_param); - pooling_layers_.push_back(shared_ptr > ( - new PoolingLayer(pooling_param))); + pooling_layers_.push_back( + shared_ptr < PoolingLayer + > (new PoolingLayer(pooling_param))); pooling_layers_[i]->SetUp(*pooling_bottom_vecs_[i], *pooling_top_vecs_[i]); // flatten layer output holders setup @@ -132,7 +133,7 @@ void SPPLayer::LayerSetUp(const vector*>& bottom, template void SPPLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " << "corresponding to (num, channels, height, width)"; channels_ = bottom[0]->channels(); @@ -141,53 +142,48 @@ void SPPLayer::Reshape(const vector*>& bottom, SPPParameter spp_param = this->layer_param_.spp_param(); split_layer_->Reshape(bottom, split_top_vec_); for (int i = 0; i < pyramid_height_; i++) { - LayerParameter pooling_param = GetPoolingParam( - i, bottom_h_, bottom_w_, spp_param); - - pooling_layers_[i].reset( - new PoolingLayer(pooling_param)); - pooling_layers_[i]->SetUp( - *pooling_bottom_vecs_[i], *pooling_top_vecs_[i]); - pooling_layers_[i]->Reshape( - *pooling_bottom_vecs_[i], *pooling_top_vecs_[i]); - flatten_layers_[i]->Reshape( - *pooling_top_vecs_[i], *flatten_top_vecs_[i]); + LayerParameter pooling_param = GetPoolingParam(i, bottom_h_, bottom_w_, + spp_param); + + pooling_layers_[i].reset(new PoolingLayer(pooling_param)); + pooling_layers_[i]->SetUp(*pooling_bottom_vecs_[i], *pooling_top_vecs_[i]); + pooling_layers_[i]->Reshape(*pooling_bottom_vecs_[i], + *pooling_top_vecs_[i]); + flatten_layers_[i]->Reshape(*pooling_top_vecs_[i], *flatten_top_vecs_[i]); } concat_layer_->Reshape(concat_bottom_vec_, top); } template void SPPLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { split_layer_->Forward(bottom, split_top_vec_); for (int i = 0; i < pyramid_height_; i++) { - pooling_layers_[i]->Forward( - *pooling_bottom_vecs_[i], *pooling_top_vecs_[i]); - flatten_layers_[i]->Forward( - *pooling_top_vecs_[i], *flatten_top_vecs_[i]); + pooling_layers_[i]->Forward(*pooling_bottom_vecs_[i], + *pooling_top_vecs_[i]); + 
flatten_layers_[i]->Forward(*pooling_top_vecs_[i], *flatten_top_vecs_[i]); } concat_layer_->Forward(concat_bottom_vec_, top); } template void SPPLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { if (!propagate_down[0]) { return; } vector concat_propagate_down(pyramid_height_, true); concat_layer_->Backward(top, concat_propagate_down, concat_bottom_vec_); for (int i = 0; i < pyramid_height_; i++) { - flatten_layers_[i]->Backward( - *flatten_top_vecs_[i], propagate_down, *pooling_top_vecs_[i]); - pooling_layers_[i]->Backward( - *pooling_top_vecs_[i], propagate_down, *pooling_bottom_vecs_[i]); + flatten_layers_[i]->Backward(*flatten_top_vecs_[i], propagate_down, + *pooling_top_vecs_[i]); + pooling_layers_[i]->Backward(*pooling_top_vecs_[i], propagate_down, + *pooling_bottom_vecs_[i]); } split_layer_->Backward(split_top_vec_, propagate_down, bottom); } - -INSTANTIATE_CLASS(SPPLayer); -REGISTER_LAYER_CLASS(SPP); +INSTANTIATE_CLASS (SPPLayer); +REGISTER_LAYER_CLASS (SPP); } // namespace caffe diff --git a/src/caffe/layers/tanh_layer.cpp b/src/caffe/layers/tanh_layer.cpp index ee5ed773..f62092b2 100644 --- a/src/caffe/layers/tanh_layer.cpp +++ b/src/caffe/layers/tanh_layer.cpp @@ -6,6 +6,7 @@ #include "caffe/layer.hpp" #include "caffe/vision_layers.hpp" +#include "caffe/util/ocl_wrapper.hpp" namespace caffe { @@ -22,8 +23,7 @@ void TanHLayer::Forward_cpu(const vector*>& bottom, template void TanHLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { + const vector& propagate_down, const vector*>& bottom) { if (propagate_down[0]) { const Dtype* top_data = top[0]->cpu_data(); const Dtype* top_diff = top[0]->cpu_diff(); @@ -37,10 +37,36 @@ void TanHLayer::Backward_cpu(const vector*>& top, } } -#ifdef CPU_ONLY +#ifndef CPU_ONLY +// begin: code modified for OpenCL port +template +void TanHLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + const int count = bottom[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + TanHForward(count, bottom_data, top_data); +} + +template +void TanHLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[0]) { + const Dtype* top_data = top[0]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const int count = bottom[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + TanHBackward(count, top_diff, top_data, bottom_diff); + } +} +// end: code modified for OpenCL port + +#else STUB_GPU(TanHLayer); #endif -INSTANTIATE_CLASS(TanHLayer); +INSTANTIATE_CLASS (TanHLayer); } // namespace caffe diff --git a/src/caffe/layers/tanh_layer.cu b/src/caffe/layers/tanh_layer.cu deleted file mode 100644 index ccd6e63e..00000000 --- a/src/caffe/layers/tanh_layer.cu +++ /dev/null @@ -1,59 +0,0 @@ -// TanH neuron activation function layer. 
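TanHForward and TanHBackward in the tanh_layer.cpp hunk above wrap the OpenCL kernels; the math is y = tanh(x) on the forward pass and dx = dy * (1 - y^2) on the backward pass, reusing the stored top data. Reference sketch of the backward step (name illustrative):

    #include <cmath>
    template <typename Dtype>
    void tanh_backward_ref(int n, const Dtype* top_diff, const Dtype* top_data,
                           Dtype* bottom_diff) {
      for (int i = 0; i < n; ++i) {
        const Dtype y = top_data[i];            // y = tanh(x) from the forward pass
        bottom_diff[i] = top_diff[i] * (1 - y * y);
      }
    }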
-// Adapted from ReLU layer code written by Yangqing Jia - -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -__global__ void TanHForward(const int n, const Dtype* in, Dtype* out) { - CUDA_KERNEL_LOOP(index, n) { - out[index] = tanh(in[index]); - } -} - -template -void TanHLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - TanHForward<<>>( - count, bottom_data, top_data); - CUDA_POST_KERNEL_CHECK; -} - -template -__global__ void TanHBackward(const int n, const Dtype* in_diff, - const Dtype* out_data, Dtype* out_diff) { - CUDA_KERNEL_LOOP(index, n) { - Dtype tanhx = out_data[index]; - out_diff[index] = in_diff[index] * (1 - tanhx * tanhx); - } -} - -template -void TanHLayer::Backward_gpu(const vector*>& top, - const vector& propagate_down, - const vector*>& bottom) { - if (propagate_down[0]) { - const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - TanHBackward<<>>( - count, top_diff, top_data, bottom_diff); - CUDA_POST_KERNEL_CHECK; - } -} - -INSTANTIATE_LAYER_GPU_FUNCS(TanHLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/threshold_layer.cpp b/src/caffe/layers/threshold_layer.cpp index 2365e7b9..eebc379a 100644 --- a/src/caffe/layers/threshold_layer.cpp +++ b/src/caffe/layers/threshold_layer.cpp @@ -2,14 +2,14 @@ #include "caffe/layer.hpp" #include "caffe/vision_layers.hpp" - +#include "caffe/util/ocl_wrapper.hpp" namespace caffe { template void ThresholdLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - NeuronLayer::LayerSetUp(bottom, top); + const vector*>& top) { + NeuronLayer < Dtype > ::LayerSetUp(bottom, top); threshold_ = this->layer_param_.threshold_param().threshold(); } @@ -24,11 +24,24 @@ void ThresholdLayer::Forward_cpu(const vector*>& bottom, } } -#ifdef CPU_ONLY +#ifndef CPU_ONLY +// begin: code modified for OpenCL port +template +void ThresholdLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + const int count = bottom[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + ThresholdForward(count, threshold_, bottom_data, top_data); +} +// end: code modified for OpenCL port + +#else STUB_GPU_FORWARD(ThresholdLayer, Forward); #endif -INSTANTIATE_CLASS(ThresholdLayer); -REGISTER_LAYER_CLASS(Threshold); +INSTANTIATE_CLASS (ThresholdLayer); +REGISTER_LAYER_CLASS (Threshold); } // namespace caffe diff --git a/src/caffe/layers/threshold_layer.cu b/src/caffe/layers/threshold_layer.cu deleted file mode 100644 index bfa7f159..00000000 --- a/src/caffe/layers/threshold_layer.cu +++ /dev/null @@ -1,33 +0,0 @@ -#include -#include - -#include "caffe/layer.hpp" -#include "caffe/vision_layers.hpp" - -namespace caffe { - -template -__global__ void ThresholdForward(const int n, const Dtype threshold, - const Dtype* in, Dtype* out) { - CUDA_KERNEL_LOOP(index, n) { - out[index] = in[index] > threshold ? 
1 : 0; - } -} - -template -void ThresholdLayer::Forward_gpu(const vector*>& bottom, - const vector*>& top) { - const Dtype* bottom_data = bottom[0]->gpu_data(); - Dtype* top_data = top[0]->mutable_gpu_data(); - const int count = bottom[0]->count(); - // NOLINT_NEXT_LINE(whitespace/operators) - ThresholdForward<<>>( - count, threshold_, bottom_data, top_data); - CUDA_POST_KERNEL_CHECK; -} - - -INSTANTIATE_LAYER_GPU_FORWARD(ThresholdLayer); - - -} // namespace caffe diff --git a/src/caffe/layers/window_data_layer.cpp b/src/caffe/layers/window_data_layer.cpp index c127d56b..7085ac63 100644 --- a/src/caffe/layers/window_data_layer.cpp +++ b/src/caffe/layers/window_data_layer.cpp @@ -32,7 +32,7 @@ WindowDataLayer::~WindowDataLayer() { template void WindowDataLayer::DataLayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { // LayerSetUp runs through the window_file and creates two structures // that hold windows: one for foreground (object) windows and one // for background (non-object) windows. We use an overlap threshold @@ -63,9 +63,8 @@ void WindowDataLayer::DataLayerSetUp(const vector*>& bottom, cache_images_ = this->layer_param_.window_data_param().cache_images(); string root_folder = this->layer_param_.window_data_param().root_folder(); - const bool prefetch_needs_rand = - this->transform_param_.mirror() || - this->transform_param_.crop_size(); + const bool prefetch_needs_rand = this->transform_param_.mirror() + || this->transform_param_.crop_size(); if (prefetch_needs_rand) { const unsigned int prefetch_rng_seed = caffe_rng_rand(); prefetch_rng_.reset(new Caffe::RNG(prefetch_rng_seed)); @@ -143,21 +142,18 @@ void WindowDataLayer::DataLayerSetUp(const vector*>& bottom, } if (image_index % 100 == 0) { - LOG(INFO) << "num: " << image_index << " " - << image_path << " " - << image_size[0] << " " - << image_size[1] << " " - << image_size[2] << " " - << "windows to process: " << num_windows; + LOG(INFO) << "num: " << image_index << " " << image_path << " " + << image_size[0] << " " << image_size[1] << " " << image_size[2] + << " " << "windows to process: " << num_windows; } } while (infile >> hashtag >> image_index); - LOG(INFO) << "Number of images: " << image_index+1; + LOG(INFO) << "Number of images: " << image_index + 1; - for (map::iterator it = label_hist.begin(); - it != label_hist.end(); ++it) { + for (map::iterator it = label_hist.begin(); it != label_hist.end(); + ++it) { LOG(INFO) << "class " << it->first << " has " << label_hist[it->first] - << " samples"; + << " samples"; } LOG(INFO) << "Amount of context padding: " @@ -185,21 +181,20 @@ void WindowDataLayer::DataLayerSetUp(const vector*>& bottom, has_mean_file_ = this->transform_param_.has_mean_file(); has_mean_values_ = this->transform_param_.mean_value_size() > 0; if (has_mean_file_) { - const string& mean_file = - this->transform_param_.mean_file(); + const string& mean_file = this->transform_param_.mean_file(); LOG(INFO) << "Loading mean file from: " << mean_file; BlobProto blob_proto; ReadProtoFromBinaryFileOrDie(mean_file.c_str(), &blob_proto); data_mean_.FromProto(blob_proto); } if (has_mean_values_) { - CHECK(has_mean_file_ == false) << - "Cannot specify mean_file and mean_value at the same time"; + CHECK(has_mean_file_ == false) + << "Cannot specify mean_file and mean_value at the same time"; for (int c = 0; c < this->transform_param_.mean_value_size(); ++c) { mean_values_.push_back(this->transform_param_.mean_value(c)); } - CHECK(mean_values_.size() == 1 || mean_values_.size() == channels) 
<< - "Specify either 1 mean_value or as many as channels: " << channels; + CHECK(mean_values_.size() == 1 || mean_values_.size() == channels) + << "Specify either 1 mean_value or as many as channels: " << channels; if (channels > 1 && mean_values_.size() == 1) { // Replicate the mean_value for simplicity for (int c = 1; c < channels; ++c) { @@ -211,7 +206,7 @@ void WindowDataLayer::DataLayerSetUp(const vector*>& bottom, template unsigned int WindowDataLayer::PrefetchRand() { - CHECK(prefetch_rng_); + CHECK (prefetch_rng_); caffe::rng_t* prefetch_rng = static_cast(prefetch_rng_->generator()); return (*prefetch_rng)(); @@ -265,20 +260,21 @@ void WindowDataLayer::InternalThreadEntry() { // sample a window timer.Start(); const unsigned int rand_index = PrefetchRand(); - vector window = (is_fg) ? - fg_windows_[rand_index % fg_windows_.size()] : - bg_windows_[rand_index % bg_windows_.size()]; + vector window = + (is_fg) ? + fg_windows_[rand_index % fg_windows_.size()] : + bg_windows_[rand_index % bg_windows_.size()]; bool do_mirror = mirror && PrefetchRand() % 2; // load the image containing the window pair > image = - image_database_[window[WindowDataLayer::IMAGE_INDEX]]; + image_database_[window[WindowDataLayer < Dtype > ::IMAGE_INDEX]]; cv::Mat cv_img; if (this->cache_images_) { - pair image_cached = - image_database_cache_[window[WindowDataLayer::IMAGE_INDEX]]; + pair < std::string, Datum > image_cached = + image_database_cache_[window[WindowDataLayer < Dtype > ::IMAGE_INDEX]]; cv_img = DecodeDatumToCVMat(image_cached.second, true); } else { cv_img = cv::imread(image.first, CV_LOAD_IMAGE_COLOR); @@ -292,10 +288,10 @@ void WindowDataLayer::InternalThreadEntry() { const int channels = cv_img.channels(); // crop window out of image and warp it - int x1 = window[WindowDataLayer::X1]; - int y1 = window[WindowDataLayer::Y1]; - int x2 = window[WindowDataLayer::X2]; - int y2 = window[WindowDataLayer::Y2]; + int x1 = window[WindowDataLayer < Dtype > ::X1]; + int y1 = window[WindowDataLayer < Dtype > ::Y1]; + int x2 = window[WindowDataLayer < Dtype > ::X2]; + int y2 = window[WindowDataLayer < Dtype > ::Y2]; int pad_w = 0; int pad_h = 0; @@ -303,12 +299,12 @@ void WindowDataLayer::InternalThreadEntry() { // scale factor by which to expand the original region // such that after warping the expanded region to crop_size x crop_size // there's exactly context_pad amount of padding on each side - Dtype context_scale = static_cast(crop_size) / - static_cast(crop_size - 2*context_pad); + Dtype context_scale = static_cast(crop_size) + / static_cast(crop_size - 2 * context_pad); // compute the expanded region - Dtype half_height = static_cast(y2-y1+1)/2.0; - Dtype half_width = static_cast(x2-x1+1)/2.0; + Dtype half_height = static_cast(y2 - y1 + 1) / 2.0; + Dtype half_width = static_cast(x2 - x1 + 1) / 2.0; Dtype center_x = static_cast(x1) + half_width; Dtype center_y = static_cast(y1) + half_height; if (use_square) { @@ -318,16 +314,16 @@ void WindowDataLayer::InternalThreadEntry() { half_height = half_width; } } - x1 = static_cast(round(center_x - half_width*context_scale)); - x2 = static_cast(round(center_x + half_width*context_scale)); - y1 = static_cast(round(center_y - half_height*context_scale)); - y2 = static_cast(round(center_y + half_height*context_scale)); + x1 = static_cast(round(center_x - half_width * context_scale)); + x2 = static_cast(round(center_x + half_width * context_scale)); + y1 = static_cast(round(center_y - half_height * context_scale)); + y2 = static_cast(round(center_y + half_height * 
context_scale)); // the expanded region may go outside of the image // so we compute the clipped (expanded) region and keep track of // the extent beyond the image - int unclipped_height = y2-y1+1; - int unclipped_width = x2-x1+1; + int unclipped_height = y2 - y1 + 1; + int unclipped_width = x2 - x1 + 1; int pad_x1 = std::max(0, -x1); int pad_y1 = std::max(0, -y1); int pad_x2 = std::max(0, x2 - cv_img.cols + 1); @@ -342,25 +338,25 @@ void WindowDataLayer::InternalThreadEntry() { CHECK_LT(x2, cv_img.cols); CHECK_LT(y2, cv_img.rows); - int clipped_height = y2-y1+1; - int clipped_width = x2-x1+1; + int clipped_height = y2 - y1 + 1; + int clipped_width = x2 - x1 + 1; // scale factors that would be used to warp the unclipped // expanded region - Dtype scale_x = - static_cast(crop_size)/static_cast(unclipped_width); - Dtype scale_y = - static_cast(crop_size)/static_cast(unclipped_height); + Dtype scale_x = static_cast(crop_size) + / static_cast(unclipped_width); + Dtype scale_y = static_cast(crop_size) + / static_cast(unclipped_height); // size to warp the clipped expanded region to - cv_crop_size.width = - static_cast(round(static_cast(clipped_width)*scale_x)); - cv_crop_size.height = - static_cast(round(static_cast(clipped_height)*scale_y)); - pad_x1 = static_cast(round(static_cast(pad_x1)*scale_x)); - pad_x2 = static_cast(round(static_cast(pad_x2)*scale_x)); - pad_y1 = static_cast(round(static_cast(pad_y1)*scale_y)); - pad_y2 = static_cast(round(static_cast(pad_y2)*scale_y)); + cv_crop_size.width = static_cast(round( + static_cast(clipped_width) * scale_x)); + cv_crop_size.height = static_cast(round( + static_cast(clipped_height) * scale_y)); + pad_x1 = static_cast(round(static_cast(pad_x1) * scale_x)); + pad_x2 = static_cast(round(static_cast(pad_x2) * scale_x)); + pad_y1 = static_cast(round(static_cast(pad_y1) * scale_y)); + pad_y2 = static_cast(round(static_cast(pad_y2) * scale_y)); pad_h = pad_y1; // if we're mirroring, we mirror the padding too (to be pedantic) @@ -380,10 +376,10 @@ void WindowDataLayer::InternalThreadEntry() { } } - cv::Rect roi(x1, y1, x2-x1+1, y2-y1+1); + cv::Rect roi(x1, y1, x2 - x1 + 1, y2 - y1 + 1); cv::Mat cv_cropped_img = cv_img(roi); - cv::resize(cv_cropped_img, cv_cropped_img, - cv_crop_size, 0, 0, cv::INTER_LINEAR); + cv::resize(cv_cropped_img, cv_cropped_img, cv_crop_size, 0, 0, + cv::INTER_LINEAR); // horizontal flip at random if (do_mirror) { @@ -392,17 +388,17 @@ void WindowDataLayer::InternalThreadEntry() { // copy the warped window into top_data for (int h = 0; h < cv_cropped_img.rows; ++h) { - const uchar* ptr = cv_cropped_img.ptr(h); + const uchar* ptr = cv_cropped_img.ptr < uchar > (h); int img_index = 0; for (int w = 0; w < cv_cropped_img.cols; ++w) { for (int c = 0; c < channels; ++c) { int top_index = ((item_id * channels + c) * crop_size + h + pad_h) - * crop_size + w + pad_w; + * crop_size + w + pad_w; // int top_index = (c * height + h) * width + w; Dtype pixel = static_cast(ptr[img_index++]); if (this->has_mean_file_) { int mean_index = (c * mean_height + h + mean_off + pad_h) - * mean_width + w + mean_off + pad_w; + * mean_width + w + mean_off + pad_w; top_data[top_index] = (pixel - mean[mean_index]) * scale; } else { if (this->has_mean_values_) { @@ -416,40 +412,7 @@ void WindowDataLayer::InternalThreadEntry() { } trans_time += timer.MicroSeconds(); // get window label - top_label[item_id] = window[WindowDataLayer::LABEL]; - - #if 0 - // useful debugging code for dumping transformed windows to disk - string file_id; - std::stringstream ss; - 
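To make the context-padding arithmetic above concrete, take illustrative values of crop_size = 227 and context_pad = 16 (typical R-CNN settings, not taken from this hunk): context_scale = 227 / (227 - 2*16) = 227/195, about 1.164. A 100x80 window is therefore expanded around its center to roughly 116x93 pixels before warping; once that expanded region is resized to 227x227, the original window occupies the central 195x195 area and about 16 pixels of context survive on each side. The pad_x*/pad_y* amounts are rescaled by the same scale_x/scale_y factors, and the copy loop offsets by pad_h/pad_w, so any part of the expanded region that fell outside the image stays as padding in the warped crop rather than being stretched over.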
ss << PrefetchRand(); - ss >> file_id; - std::ofstream inf((string("dump/") + file_id + - string("_info.txt")).c_str(), std::ofstream::out); - inf << image.first << std::endl - << window[WindowDataLayer::X1]+1 << std::endl - << window[WindowDataLayer::Y1]+1 << std::endl - << window[WindowDataLayer::X2]+1 << std::endl - << window[WindowDataLayer::Y2]+1 << std::endl - << do_mirror << std::endl - << top_label[item_id] << std::endl - << is_fg << std::endl; - inf.close(); - std::ofstream top_data_file((string("dump/") + file_id + - string("_data.txt")).c_str(), - std::ofstream::out | std::ofstream::binary); - for (int c = 0; c < channels; ++c) { - for (int h = 0; h < crop_size; ++h) { - for (int w = 0; w < crop_size; ++w) { - top_data_file.write(reinterpret_cast( - &top_data[((item_id * channels + c) * crop_size + h) - * crop_size + w]), - sizeof(Dtype)); - } - } - } - top_data_file.close(); - #endif + top_label[item_id] = window[WindowDataLayer < Dtype > ::LABEL]; item_id++; } @@ -460,7 +423,7 @@ void WindowDataLayer::InternalThreadEntry() { DLOG(INFO) << "Transform time: " << trans_time / 1000 << " ms."; } -INSTANTIATE_CLASS(WindowDataLayer); -REGISTER_LAYER_CLASS(WindowData); +INSTANTIATE_CLASS (WindowDataLayer); +REGISTER_LAYER_CLASS (WindowData); } // namespace caffe diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index a18ee638..711ec408 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -13,6 +13,7 @@ #include "caffe/util/io.hpp" #include "caffe/util/math_functions.hpp" #include "caffe/util/upgrade_proto.hpp" +#include "caffe/util/benchmark.hpp" #include "caffe/test/test_caffe_main.hpp" @@ -40,14 +41,14 @@ void Net::Init(const NetParameter& in_param) { NetParameter filtered_param; FilterNet(in_param, &filtered_param); LOG(INFO) << "Initializing net from parameters: " << std::endl - << filtered_param.DebugString(); + << filtered_param.DebugString(); // Create a copy of filtered_param with splits added where necessary. NetParameter param; InsertSplits(filtered_param, ¶m); // Basically, build all the layers and set up their connections. name_ = param.name(); map blob_name_to_idx; - set available_blobs; + set < string > available_blobs; CHECK(param.input_dim_size() == 0 || param.input_shape_size() == 0) << "Must specify either input_shape OR deprecated input_dim, not both."; if (param.input_dim_size() > 0) { @@ -80,21 +81,20 @@ void Net::Init(const NetParameter& in_param) { // Setup layer. const LayerParameter& layer_param = param.layer(layer_id); if (layer_param.propagate_down_size() > 0) { - CHECK_EQ(layer_param.propagate_down_size(), - layer_param.bottom_size()) + CHECK_EQ(layer_param.propagate_down_size(), layer_param.bottom_size()) << "propagate_down param must be specified " << "either 0 or bottom_size times "; } - layers_.push_back(LayerRegistry::CreateLayer(layer_param)); + layers_.push_back(LayerRegistry < Dtype > ::CreateLayer(layer_param)); layer_names_.push_back(layer_param.name()); LOG(INFO) << "Creating Layer " << layer_param.name(); bool need_backward = false; // Figure out this layer's input and output for (int bottom_id = 0; bottom_id < layer_param.bottom_size(); - ++bottom_id) { + ++bottom_id) { const int blob_id = AppendBottom(param, layer_id, bottom_id, - &available_blobs, &blob_name_to_idx); + &available_blobs, &blob_name_to_idx); // If a blob needs backward, this layer should provide it. 
need_backward |= blob_need_backward_[blob_id]; } @@ -105,10 +105,10 @@ void Net::Init(const NetParameter& in_param) { // If the layer specifies that AutoTopBlobs() -> true and the LayerParameter // specified fewer than the required number (as specified by // ExactNumTopBlobs() or MinTopBlobs()), allocate them here. - Layer* layer = layers_[layer_id].get(); + Layer < Dtype > *layer = layers_[layer_id].get(); if (layer->AutoTopBlobs()) { - const int needed_num_top = - std::max(layer->MinTopBlobs(), layer->ExactNumTopBlobs()); + const int needed_num_top = std::max(layer->MinTopBlobs(), + layer->ExactNumTopBlobs()); for (; num_top < needed_num_top; ++num_top) { // Add "anonymous" top blobs -- do not modify available_blobs or // blob_name_to_idx as we don't want these blobs to be usable as input @@ -137,12 +137,13 @@ void Net::Init(const NetParameter& in_param) { << "Too many params specified for layer " << layer_param.name(); ParamSpec default_param_spec; for (int param_id = 0; param_id < num_param_blobs; ++param_id) { - const ParamSpec* param_spec = (param_id < param_size) ? - &layer_param.param(param_id) : &default_param_spec; + const ParamSpec* param_spec = + (param_id < param_size) ? + &layer_param.param(param_id) : &default_param_spec; const bool param_need_backward = param_spec->lr_mult() > 0; need_backward |= param_need_backward; layers_[layer_id]->set_param_propagate_down(param_id, - param_need_backward); + param_need_backward); } for (int param_id = 0; param_id < num_param_blobs; ++param_id) { AppendParam(param, layer_id, param_id); @@ -161,15 +162,15 @@ void Net::Init(const NetParameter& in_param) { // Also checks if all bottom blobs don't need backward computation (possible // because the skip_propagate_down param) and so we can skip bacward // computation for the entire layer - set blobs_under_loss; - set blobs_skip_backp; + set < string > blobs_under_loss; + set < string > blobs_skip_backp; for (int layer_id = layers_.size() - 1; layer_id >= 0; --layer_id) { bool layer_contributes_loss = false; bool layer_skip_propagate_down = true; for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) { const string& blob_name = blob_names_[top_id_vecs_[layer_id][top_id]]; - if (layers_[layer_id]->loss(top_id) || - (blobs_under_loss.find(blob_name) != blobs_under_loss.end())) { + if (layers_[layer_id]->loss(top_id) + || (blobs_under_loss.find(blob_name) != blobs_under_loss.end())) { layer_contributes_loss = true; } if (blobs_skip_backp.find(blob_name) == blobs_skip_backp.end()) { @@ -183,19 +184,21 @@ void Net::Init(const NetParameter& in_param) { if (layer_need_backward_[layer_id] && layer_skip_propagate_down) { layer_need_backward_[layer_id] = false; for (int bottom_id = 0; bottom_id < bottom_vecs_[layer_id].size(); - ++bottom_id) { + ++bottom_id) { bottom_need_backward_[layer_id][bottom_id] = false; } } - if (!layer_contributes_loss) { layer_need_backward_[layer_id] = false; } + if (!layer_contributes_loss) { + layer_need_backward_[layer_id] = false; + } if (layer_need_backward_[layer_id]) { LOG(INFO) << layer_names_[layer_id] << " needs backward computation."; } else { LOG(INFO) << layer_names_[layer_id] - << " does not need backward computation."; + << " does not need backward computation."; } for (int bottom_id = 0; bottom_id < bottom_vecs_[layer_id].size(); - ++bottom_id) { + ++bottom_id) { if (layer_contributes_loss) { const string& blob_name = blob_names_[bottom_id_vecs_[layer_id][bottom_id]]; @@ -205,7 +208,7 @@ void Net::Init(const NetParameter& in_param) { } if 
(!bottom_need_backward_[layer_id][bottom_id]) { const string& blob_name = - blob_names_[bottom_id_vecs_[layer_id][bottom_id]]; + blob_names_[bottom_id_vecs_[layer_id][bottom_id]]; blobs_skip_backp.insert(blob_name); } } @@ -215,16 +218,16 @@ void Net::Init(const NetParameter& in_param) { for (int layer_id = 0; layer_id < layers_.size(); ++layer_id) { layer_need_backward_[layer_id] = true; for (int bottom_id = 0; - bottom_id < bottom_need_backward_[layer_id].size(); ++bottom_id) { + bottom_id < bottom_need_backward_[layer_id].size(); ++bottom_id) { bottom_need_backward_[layer_id][bottom_id] = - bottom_need_backward_[layer_id][bottom_id] || - layers_[layer_id]->AllowForceBackward(bottom_id); + bottom_need_backward_[layer_id][bottom_id] + || layers_[layer_id]->AllowForceBackward(bottom_id); blob_need_backward_[bottom_id_vecs_[layer_id][bottom_id]] = - blob_need_backward_[bottom_id_vecs_[layer_id][bottom_id]] || - bottom_need_backward_[layer_id][bottom_id]; + blob_need_backward_[bottom_id_vecs_[layer_id][bottom_id]] + || bottom_need_backward_[layer_id][bottom_id]; } for (int param_id = 0; param_id < layers_[layer_id]->blobs().size(); - ++param_id) { + ++param_id) { layers_[layer_id]->set_param_propagate_down(param_id, true); } } @@ -258,7 +261,7 @@ void Net::FilterNet(const NetParameter& param, const LayerParameter& layer_param = param.layer(i); const string& layer_name = layer_param.name(); CHECK(layer_param.include_size() == 0 || layer_param.exclude_size() == 0) - << "Specify either include rules or exclude rules; not both."; + << "Specify either include rules or exclude rules; not both."; // If no include rules are specified, the layer is included by default and // only excluded if it meets one of the exclude rules. bool layer_included = (layer_param.include_size() == 0); @@ -279,16 +282,16 @@ void Net::FilterNet(const NetParameter& param, } template -bool Net::StateMeetsRule(const NetState& state, - const NetStateRule& rule, const string& layer_name) { +bool Net::StateMeetsRule(const NetState& state, const NetStateRule& rule, + const string& layer_name) { // Check whether the rule is broken due to phase. if (rule.has_phase()) { - if (rule.phase() != state.phase()) { - LOG(INFO) << "The NetState phase (" << state.phase() + if (rule.phase() != state.phase()) { + LOG(INFO) << "The NetState phase (" << state.phase() << ") differed from the phase (" << rule.phase() << ") specified by a rule in layer " << layer_name; - return false; - } + return false; + } } // Check whether the rule is broken due to min level. if (rule.has_min_level()) { @@ -314,11 +317,13 @@ bool Net::StateMeetsRule(const NetState& state, // Check that the NetState contains the rule's ith stage. bool has_stage = false; for (int j = 0; !has_stage && j < state.stage_size(); ++j) { - if (rule.stage(i) == state.stage(j)) { has_stage = true; } + if (rule.stage(i) == state.stage(j)) { + has_stage = true; + } } if (!has_stage) { LOG(INFO) << "The NetState did not contain stage '" << rule.stage(i) - << "' specified by a rule in layer " << layer_name; + << "' specified by a rule in layer " << layer_name; return false; } } @@ -328,11 +333,13 @@ bool Net::StateMeetsRule(const NetState& state, // Check that the NetState contains the rule's ith not_stage. 
bool has_stage = false; for (int j = 0; !has_stage && j < state.stage_size(); ++j) { - if (rule.not_stage(i) == state.stage(j)) { has_stage = true; } + if (rule.not_stage(i) == state.stage(j)) { + has_stage = true; + } } if (has_stage) { LOG(INFO) << "The NetState contained a not_stage '" << rule.not_stage(i) - << "' specified by a rule in layer " << layer_name; + << "' specified by a rule in layer " << layer_name; return false; } } @@ -343,22 +350,25 @@ bool Net::StateMeetsRule(const NetState& state, // layer_id == -1, tops have layer_id >= 0.) template void Net::AppendTop(const NetParameter& param, const int layer_id, - const int top_id, set* available_blobs, - map* blob_name_to_idx) { - shared_ptr layer_param((layer_id >= 0) ? - (new LayerParameter(param.layer(layer_id))) : NULL); - const string& blob_name = layer_param ? - (layer_param->top_size() > top_id ? - layer_param->top(top_id) : "(automatic)") : param.input(top_id); + const int top_id, set* available_blobs, + map* blob_name_to_idx) { + shared_ptr < LayerParameter + > layer_param( + (layer_id >= 0) ? (new LayerParameter(param.layer(layer_id))) : NULL); + const string& blob_name = + layer_param ? + (layer_param->top_size() > top_id ? + layer_param->top(top_id) : "(automatic)") : + param.input(top_id); // Check if we are doing in-place computation - if (blob_name_to_idx && layer_param && layer_param->bottom_size() > top_id && - blob_name == layer_param->bottom(top_id)) { + if (blob_name_to_idx && layer_param && layer_param->bottom_size() > top_id + && blob_name == layer_param->bottom(top_id)) { // In-place computation LOG(INFO) << layer_param->name() << " -> " << blob_name << " (in-place)"; top_vecs_[layer_id].push_back(blobs_[(*blob_name_to_idx)[blob_name]].get()); top_id_vecs_[layer_id].push_back((*blob_name_to_idx)[blob_name]); - } else if (blob_name_to_idx && - blob_name_to_idx->find(blob_name) != blob_name_to_idx->end()) { + } else if (blob_name_to_idx + && blob_name_to_idx->find(blob_name) != blob_name_to_idx->end()) { // If we are not doing in-place computation but have duplicated blobs, // raise an error. LOG(FATAL) << "Duplicate blobs produced by multiple sources."; @@ -369,19 +379,20 @@ void Net::AppendTop(const NetParameter& param, const int layer_id, } else { LOG(INFO) << "Input " << top_id << " -> " << blob_name; } - shared_ptr > blob_pointer(new Blob()); + shared_ptr < Blob > blob_pointer(new Blob()); const int blob_id = blobs_.size(); blobs_.push_back(blob_pointer); blob_names_.push_back(blob_name); blob_need_backward_.push_back(false); - if (blob_name_to_idx) { (*blob_name_to_idx)[blob_name] = blob_id; } + if (blob_name_to_idx) { + (*blob_name_to_idx)[blob_name] = blob_id; + } if (layer_id == -1) { // Set the (explicitly specified) dimensions of the input blob. if (param.input_dim_size() > 0) { blob_pointer->Reshape(param.input_dim(top_id * 4), - param.input_dim(top_id * 4 + 1), - param.input_dim(top_id * 4 + 2), - param.input_dim(top_id * 4 + 3)); + param.input_dim(top_id * 4 + 1), param.input_dim(top_id * 4 + 2), + param.input_dim(top_id * 4 + 3)); } else { blob_pointer->Reshape(param.input_shape(top_id)); } @@ -392,7 +403,9 @@ void Net::AppendTop(const NetParameter& param, const int layer_id, top_vecs_[layer_id].push_back(blob_pointer.get()); } } - if (available_blobs) { available_blobs->insert(blob_name); } + if (available_blobs) { + available_blobs->insert(blob_name); + } } // Helper for Net::Init: add a new bottom blob to the net. 
@@ -403,8 +416,8 @@ int Net::AppendBottom(const NetParameter& param, const int layer_id, const LayerParameter& layer_param = param.layer(layer_id); const string& blob_name = layer_param.bottom(bottom_id); if (available_blobs->find(blob_name) == available_blobs->end()) { - LOG(FATAL) << "Unknown blob input " << blob_name - << " (at index " << bottom_id << ") to layer " << layer_id; + LOG(FATAL) << "Unknown blob input " << blob_name << " (at index " + << bottom_id << ") to layer " << layer_id; } const int blob_id = (*blob_name_to_idx)[blob_name]; LOG(INFO) << layer_names_[layer_id] << " <- " << blob_name; @@ -415,15 +428,14 @@ int Net::AppendBottom(const NetParameter& param, const int layer_id, // Check if the backpropagation on bottom_id should be skipped if (layer_param.propagate_down_size() > 0) propagate_down = layer_param.propagate_down(bottom_id); - const bool need_backward = blob_need_backward_[blob_id] && - propagate_down; + const bool need_backward = blob_need_backward_[blob_id] && propagate_down; bottom_need_backward_[layer_id].push_back(need_backward); return blob_id; } template void Net::AppendParam(const NetParameter& param, const int layer_id, - const int param_id) { + const int param_id) { const LayerParameter& layer_param = layers_[layer_id]->layer_param(); const int param_size = layer_param.param_size(); string param_name = @@ -439,8 +451,9 @@ void Net::AppendParam(const NetParameter& param, const int layer_id, params_.push_back(layers_[layer_id]->blobs()[param_id]); param_id_vecs_[layer_id].push_back(net_param_id); param_layer_indices_.push_back(make_pair(layer_id, param_id)); - if (!param_size || !param_name.size() || (param_name.size() && - param_names_index_.find(param_name) == param_names_index_.end())) { + if (!param_size || !param_name.size() + || (param_name.size() + && param_names_index_.find(param_name) == param_names_index_.end())) { // This layer "owns" this parameter blob -- it is either anonymous // (i.e., not given a param_name) or explicitly given a name that we // haven't already seen. @@ -452,19 +465,19 @@ void Net::AppendParam(const NetParameter& param, const int layer_id, // Named param blob with name we've seen before: share params const int owner_net_param_id = param_names_index_[param_name]; param_owners_.push_back(owner_net_param_id); - const pair& owner_index = - param_layer_indices_[owner_net_param_id]; + const pair& owner_index = param_layer_indices_[owner_net_param_id]; const int owner_layer_id = owner_index.first; const int owner_param_id = owner_index.second; LOG(INFO) << "Sharing parameters '" << param_name << "' owned by " - << "layer '" << layer_names_[owner_layer_id] << "', param " - << "index " << owner_param_id; - Blob* this_blob = layers_[layer_id]->blobs()[param_id].get(); - Blob* owner_blob = + << "layer '" << layer_names_[owner_layer_id] << "', param " << "index " + << owner_param_id; + Blob < Dtype > *this_blob = layers_[layer_id]->blobs()[param_id].get(); + Blob < Dtype > *owner_blob = layers_[owner_layer_id]->blobs()[owner_param_id].get(); const int param_size = layer_param.param_size(); - if (param_size > param_id && (layer_param.param(param_id).share_mode() == - ParamSpec_DimCheckMode_PERMISSIVE)) { + if (param_size > param_id + && (layer_param.param(param_id).share_mode() + == ParamSpec_DimCheckMode_PERMISSIVE)) { // Permissive dimension checking -- only check counts are the same. 
CHECK_EQ(this_blob->count(), owner_blob->count()) << "Shared parameter blobs must have the same count."; @@ -482,11 +495,11 @@ void Net::GetLearningRateAndWeightDecay() { LOG(INFO) << "Collecting Learning Rate and Weight Decay."; ParamSpec default_param_spec; for (int i = 0; i < layers_.size(); ++i) { - vector > >& layer_blobs = layers_[i]->blobs(); + vector < shared_ptr > > &layer_blobs = layers_[i]->blobs(); for (int j = 0; j < layer_blobs.size(); ++j) { const ParamSpec* param_spec = (layers_[i]->layer_param().param_size() > j) ? - &layers_[i]->layer_param().param(j) : &default_param_spec; + &layers_[i]->layer_param().param(j) : &default_param_spec; params_lr_.push_back(param_spec->lr_mult()); params_weight_decay_.push_back(param_spec->decay_mult()); } @@ -503,12 +516,29 @@ Dtype Net::ForwardFromTo(int start, int end) { InputDebugInfo(i); } } + + CPUTimer forward_timer; + CPUTimer layer_timer; + forward_timer.Start(); + for (int i = start; i <= end; ++i) { - // LOG(ERROR) << "Forwarding " << layer_names_[i]; + layer_timer.Start(); Dtype layer_loss = layers_[i]->Forward(bottom_vecs_[i], top_vecs_[i]); loss += layer_loss; - if (debug_info_) { ForwardDebugInfo(i); } + if (debug_info_) { + ForwardDebugInfo(i); + } +#ifndef CPU_ONLY + clFinish(amdDevice.CommandQueue); +#endif + layer_timer.Stop(); + printf("Forwarding %s,\ttime %f ms\n", layer_names_[i].c_str(), + layer_timer.MilliSeconds()); } + + forward_timer.Stop(); + printf("Total Forward time: %f\n\n", forward_timer.MilliSeconds()); + return loss; } @@ -567,13 +597,30 @@ template void Net::BackwardFromTo(int start, int end) { CHECK_GE(end, 0); CHECK_LT(start, layers_.size()); + + CPUTimer backward_timer; + CPUTimer layer_timer; + backward_timer.Start(); + for (int i = start; i >= end; --i) { + layer_timer.Start(); if (layer_need_backward_[i]) { - layers_[i]->Backward( - top_vecs_[i], bottom_need_backward_[i], bottom_vecs_[i]); - if (debug_info_) { BackwardDebugInfo(i); } + layers_[i]->Backward(top_vecs_[i], bottom_need_backward_[i], + bottom_vecs_[i]); + if (debug_info_) { + BackwardDebugInfo(i); + } +#ifndef CPU_ONLY + clFinish(amdDevice.CommandQueue); +#endif + layer_timer.Start(); + printf("Backwarding %s,\ttime %f ms\n", layer_names_[i].c_str(), + layer_timer.MilliSeconds()); } } + + backward_timer.Stop(); + printf("Total Backward time: %f\n\n", backward_timer.MilliSeconds()); } template @@ -581,8 +628,8 @@ void Net::InputDebugInfo(const int input_id) { const Blob& blob = *net_input_blobs_[input_id]; const string& blob_name = blob_names_[net_input_blob_indices_[input_id]]; const Dtype data_abs_val_mean = blob.asum_data() / blob.count(); - LOG(INFO) << " [Forward] " - << "Input " << blob_name << " data: " << data_abs_val_mean; + LOG(INFO) << " [Forward] " << "Input " << blob_name << " data: " + << data_abs_val_mean; } template @@ -591,19 +638,17 @@ void Net::ForwardDebugInfo(const int layer_id) { const Blob& blob = *top_vecs_[layer_id][top_id]; const string& blob_name = blob_names_[top_id_vecs_[layer_id][top_id]]; const Dtype data_abs_val_mean = blob.asum_data() / blob.count(); - LOG(INFO) << " [Forward] " - << "Layer " << layer_names_[layer_id] << ", top blob " << blob_name - << " data: " << data_abs_val_mean; + LOG(INFO) << " [Forward] " << "Layer " << layer_names_[layer_id] + << ", top blob " << blob_name << " data: " << data_abs_val_mean; } for (int param_id = 0; param_id < layers_[layer_id]->blobs().size(); - ++param_id) { + ++param_id) { const Blob& blob = *layers_[layer_id]->blobs()[param_id]; const int net_param_id = 
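The per-layer timing added to ForwardFromTo and BackwardFromTo follows one pattern: start a CPUTimer, run the layer, call clFinish(amdDevice.CommandQueue) so that the asynchronously enqueued OpenCL work has actually finished before the timer is read, then stop and print. A minimal sketch of the forward-path pattern as added above (the backward hunk is analogous, though it calls layer_timer.Start() a second time where the forward path calls Stop()):

layer_timer.Start();
Dtype layer_loss = layers_[i]->Forward(bottom_vecs_[i], top_vecs_[i]);
loss += layer_loss;
#ifndef CPU_ONLY
clFinish(amdDevice.CommandQueue);  // drain the OpenCL queue; without this the timer
                                   // would only measure kernel-enqueue time
#endif
layer_timer.Stop();
printf("Forwarding %s,\ttime %f ms\n", layer_names_[i].c_str(),
    layer_timer.MilliSeconds());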
param_id_vecs_[layer_id][param_id]; const string& blob_name = param_display_names_[net_param_id]; const Dtype data_abs_val_mean = blob.asum_data() / blob.count(); - LOG(INFO) << " [Forward] " - << "Layer " << layer_names_[layer_id] << ", param blob " << blob_name - << " data: " << data_abs_val_mean; + LOG(INFO) << " [Forward] " << "Layer " << layer_names_[layer_id] + << ", param blob " << blob_name << " data: " << data_abs_val_mean; } } @@ -611,22 +656,24 @@ template void Net::BackwardDebugInfo(const int layer_id) { const vector*>& bottom_vec = bottom_vecs_[layer_id]; for (int bottom_id = 0; bottom_id < bottom_vec.size(); ++bottom_id) { - if (!bottom_need_backward_[layer_id][bottom_id]) { continue; } + if (!bottom_need_backward_[layer_id][bottom_id]) { + continue; + } const Blob& blob = *bottom_vec[bottom_id]; const string& blob_name = blob_names_[bottom_id_vecs_[layer_id][bottom_id]]; const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count(); - LOG(INFO) << " [Backward] " - << "Layer " << layer_names_[layer_id] << ", bottom blob " << blob_name - << " diff: " << diff_abs_val_mean; + LOG(INFO) << " [Backward] " << "Layer " << layer_names_[layer_id] + << ", bottom blob " << blob_name << " diff: " << diff_abs_val_mean; } for (int param_id = 0; param_id < layers_[layer_id]->blobs().size(); - ++param_id) { - if (!layers_[layer_id]->param_propagate_down(param_id)) { continue; } + ++param_id) { + if (!layers_[layer_id]->param_propagate_down(param_id)) { + continue; + } const Blob& blob = *layers_[layer_id]->blobs()[param_id]; const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count(); - LOG(INFO) << " [Backward] " - << "Layer " << layer_names_[layer_id] << ", param blob " << param_id - << " diff: " << diff_abs_val_mean; + LOG(INFO) << " [Backward] " << "Layer " << layer_names_[layer_id] + << ", param blob " << param_id << " diff: " << diff_abs_val_mean; } } @@ -639,15 +686,14 @@ void Net::UpdateDebugInfo(const int param_id) { const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count(); if (param_owner < 0) { const Dtype data_abs_val_mean = blob.asum_data() / blob.count(); - LOG(INFO) << " [Update] Layer " << layer_name - << ", param " << param_display_name - << " data: " << data_abs_val_mean << "; diff: " << diff_abs_val_mean; + LOG(INFO) << " [Update] Layer " << layer_name << ", param " + << param_display_name << " data: " << data_abs_val_mean << "; diff: " + << diff_abs_val_mean; } else { const string& owner_layer_name = layer_names_[param_layer_indices_[param_owner].first]; - LOG(INFO) << " [Update] Layer " << layer_name - << ", param blob " << param_display_name - << " (owned by layer " << owner_layer_name << ", " + LOG(INFO) << " [Update] Layer " << layer_name << ", param blob " + << param_display_name << " (owned by layer " << owner_layer_name << ", " << "param " << param_display_names_[param_owners_[param_id]] << ")" << " diff: " << diff_abs_val_mean; } @@ -657,11 +703,11 @@ template void Net::ShareTrainedLayersWith(const Net* other) { int num_source_layers = other->layers().size(); for (int i = 0; i < num_source_layers; ++i) { - Layer* source_layer = other->layers()[i].get(); + Layer < Dtype > *source_layer = other->layers()[i].get(); const string& source_layer_name = other->layer_names()[i]; int target_layer_id = 0; - while (target_layer_id != layer_names_.size() && - layer_names_[target_layer_id] != source_layer_name) { + while (target_layer_id != layer_names_.size() + && layer_names_[target_layer_id] != source_layer_name) { ++target_layer_id; } if (target_layer_id == 
layer_names_.size()) { @@ -669,12 +715,12 @@ void Net::ShareTrainedLayersWith(const Net* other) { continue; } DLOG(INFO) << "Copying source layer " << source_layer_name; - vector > >& target_blobs = + vector < shared_ptr > > &target_blobs = layers_[target_layer_id]->blobs(); CHECK_EQ(target_blobs.size(), source_layer->blobs().size()) << "Incompatible number of blobs for layer " << source_layer_name; for (int j = 0; j < target_blobs.size(); ++j) { - Blob* source_blob = source_layer->blobs()[j].get(); + Blob < Dtype > *source_blob = source_layer->blobs()[j].get(); CHECK(target_blobs[j]->shape() == source_blob->shape()); target_blobs[j]->ShareData(*source_blob); } @@ -697,7 +743,9 @@ void Net::Backward() { if (debug_info_) { Dtype asum_data = 0, asum_diff = 0, sumsq_data = 0, sumsq_diff = 0; for (int i = 0; i < params_.size(); ++i) { - if (param_owners_[i] >= 0) { continue; } + if (param_owners_[i] >= 0) { + continue; + } asum_data += params_[i]->asum_data(); asum_diff += params_[i]->asum_diff(); sumsq_data += params_[i]->sumsq_data(); @@ -725,8 +773,8 @@ void Net::CopyTrainedLayersFrom(const NetParameter& param) { const LayerParameter& source_layer = param.layer(i); const string& source_layer_name = source_layer.name(); int target_layer_id = 0; - while (target_layer_id != layer_names_.size() && - layer_names_[target_layer_id] != source_layer_name) { + while (target_layer_id != layer_names_.size() + && layer_names_[target_layer_id] != source_layer_name) { ++target_layer_id; } if (target_layer_id == layer_names_.size()) { @@ -734,7 +782,7 @@ void Net::CopyTrainedLayersFrom(const NetParameter& param) { continue; } DLOG(INFO) << "Copying source layer " << source_layer_name; - vector > >& target_blobs = + vector < shared_ptr > > &target_blobs = layers_[target_layer_id]->blobs(); CHECK_EQ(target_blobs.size(), source_layer.blobs_size()) << "Incompatible number of blobs for layer " << source_layer_name; @@ -779,11 +827,18 @@ void Net::Update() { // diff. (Assumes that the learning rate, weight decay, etc. have already been // accounted for in the current diff.) for (int i = 0; i < params_.size(); ++i) { - if (param_owners_[i] < 0) { continue; } - if (debug_info_) { UpdateDebugInfo(i); } + if (param_owners_[i] < 0) { + continue; + } + if (debug_info_) { + UpdateDebugInfo(i); + } const int count = params_[i]->count(); const Dtype* this_diff; Dtype* owner_diff; + this_diff = params_[i]->cpu_diff(); + owner_diff = params_[param_owners_[i]]->mutable_cpu_diff(); + switch (Caffe::mode()) { case Caffe::CPU: this_diff = params_[i]->cpu_diff(); @@ -794,7 +849,8 @@ void Net::Update() { #ifndef CPU_ONLY this_diff = params_[i]->gpu_diff(); owner_diff = params_[param_owners_[i]]->mutable_gpu_diff(); - caffe_gpu_add(count, this_diff, owner_diff, owner_diff); + // caffe_gpu_add(count, this_diff, owner_diff, owner_diff); + caffe_gpu_axpy < Dtype > (count, 1.0, this_diff, owner_diff); #else NO_GPU; #endif @@ -805,8 +861,12 @@ void Net::Update() { } // Now, update the owned parameters. 
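One behavioural note on the Net::Update() hunk above: accumulating a shared parameter's gradient into its owner now uses caffe_gpu_axpy instead of caffe_gpu_add on the GPU branch. With alpha = 1.0 the result is identical; axpy simply expresses the accumulation as a single standard BLAS operation:

// caffe_gpu_add(count, a, b, y)       computes y[i] = a[i] + b[i]
// caffe_gpu_axpy(count, alpha, x, y)  computes y[i] = alpha * x[i] + y[i]  (BLAS axpy)
// Hence, for folding a shared blob's diff into its owner's diff:
caffe_gpu_axpy<Dtype>(count, Dtype(1.0), this_diff, owner_diff);  // owner_diff += this_diff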
for (int i = 0; i < params_.size(); ++i) { - if (param_owners_[i] >= 0) { continue; } - if (debug_info_) { UpdateDebugInfo(i); } + if (param_owners_[i] >= 0) { + continue; + } + if (debug_info_) { + UpdateDebugInfo(i); + } params_[i]->Update(); } } @@ -819,11 +879,11 @@ bool Net::has_blob(const string& blob_name) const { template const shared_ptr > Net::blob_by_name( const string& blob_name) const { - shared_ptr > blob_ptr; + shared_ptr < Blob > blob_ptr; if (has_blob(blob_name)) { blob_ptr = blobs_[blob_names_index_.find(blob_name)->second]; } else { - blob_ptr.reset((Blob*)(NULL)); + blob_ptr.reset((Blob*) (NULL)); LOG(WARNING) << "Unknown blob name " << blob_name; } return blob_ptr; @@ -837,16 +897,16 @@ bool Net::has_layer(const string& layer_name) const { template const shared_ptr > Net::layer_by_name( const string& layer_name) const { - shared_ptr > layer_ptr; + shared_ptr < Layer > layer_ptr; if (has_layer(layer_name)) { layer_ptr = layers_[layer_names_index_.find(layer_name)->second]; } else { - layer_ptr.reset((Layer*)(NULL)); + layer_ptr.reset((Layer*) (NULL)); LOG(WARNING) << "Unknown layer name " << layer_name; } return layer_ptr; } -INSTANTIATE_CLASS(Net); +INSTANTIATE_CLASS (Net); } // namespace caffe diff --git a/src/caffe/ocl/bnll_layer.cl b/src/caffe/ocl/bnll_layer.cl new file mode 100644 index 00000000..99d04575 --- /dev/null +++ b/src/caffe/ocl/bnll_layer.cl @@ -0,0 +1,52 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + +#define kBNLL_THRESHOLD 50.0 + +template +__kernel void BNLLForward(const int n, __global const T* in, __global T* out) { + int index = get_global_id(0); + if (index < n) { + out[index] = in[index] > 0 ? in[index] + log(1. + exp(-in[index])) : log(1. 
+ exp(in[index])); + } +} +template __attribute__((mangled_name(BNLLForward_float))) __kernel void BNLLForward(const int n, __global const float* in, __global float* out); +template __attribute__((mangled_name(BNLLForward_double))) __kernel void BNLLForward(const int n, __global const double* in, __global double* out); + +template +__kernel void BNLLBackward(const int n, __global const T* in_diff, + __global const T* in_data, __global T* out_diff) { + int index = get_global_id(0); + if (index < n) { + T expval = exp(min(in_data[index], T(kBNLL_THRESHOLD))); + out_diff[index] = in_diff[index] * expval / (expval + 1.); + } +} + +template __attribute__((mangled_name(BNLLBackward_float))) __kernel void BNLLBackward(const int n, __global const float* in_diff, + __global const float* in_data, __global float* out_diff); +template __attribute__((mangled_name(BNLLBackward_double))) __kernel void BNLLBackward(const int n, __global const double* in_diff, + __global const double* in_data, __global double* out_diff); diff --git a/src/caffe/ocl/concat_layer.cl b/src/caffe/ocl/concat_layer.cl new file mode 100644 index 00000000..a9663fce --- /dev/null +++ b/src/caffe/ocl/concat_layer.cl @@ -0,0 +1,54 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
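The new .cl sources all use the same idiom: each kernel is written once over a template parameter T and then explicitly instantiated with __attribute__((mangled_name(...))), which pins a predictable entry-point name per element type (BNLLForward_float, BNLLForward_double, and so on) that the host can look up by string; templates and mangled_name are AMD OpenCL extensions rather than standard OpenCL C. The matching C++ wrappers are not part of this hunk. A hedged sketch of how such a launch typically looks with the standard OpenCL host API (program, bottom_data, top_data and the work-group size are illustrative assumptions; only amdDevice.CommandQueue appears elsewhere in this patch):

// Hypothetical host-side launch of BNLLForward_float; error checks omitted,
// and `program`, `bottom_data`, `top_data`, `count` assumed to exist in scope.
cl_kernel kernel = clCreateKernel(program, "BNLLForward_float", NULL);
cl_int n = count;
clSetKernelArg(kernel, 0, sizeof(cl_int), &n);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &bottom_data);   // __global const float* in
clSetKernelArg(kernel, 2, sizeof(cl_mem), &top_data);      // __global float* out
size_t local_size = 256;
size_t global_size = ((count + local_size - 1) / local_size) * local_size;
// global size is rounded up to a multiple of the work-group size; the kernel's
// "if (index < n)" guard handles the padded work-items.
clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
    &global_size, &local_size, 0, NULL, NULL);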
+ **************************************************************************************/ + +template +__kernel void Concat(const int nthreads, __global const T* in_data, + const int forward, const int num_concats, const int concat_size, + const int top_concat_axis, const int bottom_concat_axis, + const int offset_concat_axis, __global T* out_data) { + int index = get_global_id(0); + if(index < nthreads) { + const int total_concat_size = concat_size * bottom_concat_axis; + const int concat_num = index / total_concat_size; + const int concat_index = index % total_concat_size; + const int top_index = concat_index + + (concat_num * top_concat_axis + offset_concat_axis) * concat_size; + if (forward == 1) { + out_data[top_index] = in_data[index]; + } else { + out_data[index] = in_data[top_index]; + } + } +} + +template __attribute__((mangled_name(Concat_float))) __kernel void Concat(const int nthreads, __global const float* in_data, + const int forward, const int num_concats, const int concat_size, + const int top_concat_axis, const int bottom_concat_axis, + const int offset_concat_axis, __global float* out_data); +template __attribute__((mangled_name(Concat_double))) __kernel void Concat(const int nthreads, __global const double* in_data, + const int forward, const int num_concats, const int concat_size, + const int top_concat_axis, const int bottom_concat_axis, + const int offset_concat_axis, __global double* out_data); diff --git a/src/caffe/ocl/contrastive_loss_layer.cl b/src/caffe/ocl/contrastive_loss_layer.cl new file mode 100644 index 00000000..477f2ff4 --- /dev/null +++ b/src/caffe/ocl/contrastive_loss_layer.cl @@ -0,0 +1,64 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
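A concrete reading of the Concat index math above, with assumed shapes for illustration: concatenating a 2x3x4x4 and a 2x5x4x4 blob along the channel axis gives concat_size = 4*4 = 16 (the spatial size below the concat axis), top_concat_axis = 3+5 = 8, and num_concats = 2. For the second bottom, bottom_concat_axis = 5 and offset_concat_axis = 3, so each of its nthreads = 2*5*16 = 160 elements is split into concat_num = index / 80 and concat_index = index % 80 and lands at top_index = concat_index + (concat_num * 8 + 3) * 16. The backward call (forward == 0) reads through the same mapping in the opposite direction.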
+ **************************************************************************************/ + +template +__kernel void CLLBackward(const int count, const int channels, + const Dtype margin, const bool legacy_version, const Dtype alpha, + __global const Dtype* y, __global const Dtype* diff, __global const Dtype* dist_sq, + __global Dtype *bottom_diff) { + int i = get_global_id(0); + if(i < count) { + int n = i / channels; // the num index, to access y and dist_sq + if (static_cast(y[n])) { // similar pairs + bottom_diff[i] = alpha * diff[i]; + } else { // dissimilar pairs + Dtype mdist(0.0); + Dtype beta(0.0); + if (legacy_version) { + mdist = (margin - dist_sq[n]); + beta = -alpha; + } else { + Dtype dist = sqrt(dist_sq[n]); + mdist = (margin - dist); + beta = -alpha * mdist / (dist + Dtype(1e-4)) * diff[i]; + } + if (mdist > 0.0) { + bottom_diff[i] = beta; + } else { + bottom_diff[i] = 0; + } + } + } +} + +template __attribute__((mangled_name(CLLBackward_float))) __kernel void CLLBackward(const int count, const int channels, + const float margin, const bool legacy_version, const float alpha, + __global const float* y, __global const float* diff, __global const float* dist_sq, + __global float *bottom_diff); +template __attribute__((mangled_name(CLLBackward_double))) __kernel void CLLBackward(const int count, const int channels, + const double margin, const bool legacy_version, const double alpha, + __global const double* y, __global const double* diff, __global const double* dist_sq, + __global double *bottom_diff); diff --git a/src/caffe/ocl/dropout_layer.cl b/src/caffe/ocl/dropout_layer.cl new file mode 100644 index 00000000..98d44f86 --- /dev/null +++ b/src/caffe/ocl/dropout_layer.cl @@ -0,0 +1,45 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ **************************************************************************************/ + +template +__kernel void DropoutForward(const int n, __global T *in, __global const unsigned int* mask, const unsigned int threshold, const float scale, __global T *out) { + int index = get_global_id(0); + if (index < n) { + out[index] = in[index] * scale * (mask[index] > threshold); + } +} +template __attribute__((mangled_name(DropoutForward_float))) __kernel void DropoutForward(const int n, __global float* in, __global const unsigned int* mask, const unsigned int threshold, const float scale, __global float* out); +template __attribute__((mangled_name(DropoutForward_double))) __kernel void DropoutForward(const int n, __global double* in, __global const unsigned int* mask, const unsigned int threshold, const float scale, __global double* out); + +template +__kernel void DropoutBackward(const int n, __global T *in_diff, __global const unsigned int *mask, const unsigned int threshold, const float scale, __global T *out_diff) { + int index = get_global_id(0); + if (index < n) { + out_diff[index] = in_diff[index] * scale * (mask[index] > threshold); + } +} +template __attribute__((mangled_name(DropoutBackward_float))) __kernel void DropoutBackward(const int n, __global float* in_diff, __global const unsigned int* mask, const unsigned int threshold, const float scale, __global float* out_diff); +template __attribute__((mangled_name(DropoutBackward_double))) __kernel void DropoutBackward(const int n, __global double* in_diff, __global const unsigned int* mask, const unsigned int threshold, const float scale, __global double* out_diff); diff --git a/src/caffe/ocl/eltwise_layer.cl b/src/caffe/ocl/eltwise_layer.cl new file mode 100644 index 00000000..88137dd7 --- /dev/null +++ b/src/caffe/ocl/eltwise_layer.cl @@ -0,0 +1,73 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ **************************************************************************************/ + +template +__kernel void MaxForward(const int nthreads, __global const Dtype* bottom_data_a, + __global const Dtype* bottom_data_b, const int blob_idx, __global Dtype* top_data, + __global int* mask) { + int index = get_global_id(0); + if(index < nthreads) { + Dtype maxval = -FLT_MAX; + int maxidx = -1; + if (bottom_data_a[index] > bottom_data_b[index]) { + // only update for very first bottom_data blob (blob_idx == 0) + if (blob_idx == 0) { + maxval = bottom_data_a[index]; + top_data[index] = maxval; + maxidx = blob_idx; + mask[index] = maxidx; + } + } else { + maxval = bottom_data_b[index]; + top_data[index] = maxval; + maxidx = blob_idx + 1; + mask[index] = maxidx; + } + } +} +template __attribute__((mangled_name(MaxForward_float))) __kernel void MaxForward(const int nthreads, __global const float* bottom_data_a, + __global const float* bottom_data_b, const int blob_idx, __global float* top_data, + __global int* mask); +template __attribute__((mangled_name(MaxForward_double))) __kernel void MaxForward(const int nthreads, __global const double* bottom_data_a, + __global const double* bottom_data_b, const int blob_idx, __global double* top_data, + __global int* mask); + +template +__kernel void MaxBackward(const int nthreads, __global const Dtype* top_diff, + const int blob_idx, __global const int* mask, __global Dtype* bottom_diff) { + int index = get_global_id(0); + if(index < nthreads) { + Dtype gradient = 0; + if (mask[index] == blob_idx) { + gradient += top_diff[index]; + } + bottom_diff[index] = gradient; + } +} +template __attribute__((mangled_name(MaxBackward_float))) __kernel void MaxBackward(const int nthreads, __global const float* top_diff, + const int blob_idx, __global const int* mask, __global float* bottom_diff); +template __attribute__((mangled_name(MaxBackward_double))) __kernel void MaxBackward(const int nthreads, __global const double* top_diff, + const int blob_idx, __global const int* mask, __global double* bottom_diff); diff --git a/src/caffe/ocl/im2col.cl b/src/caffe/ocl/im2col.cl new file mode 100644 index 00000000..f1a97eab --- /dev/null +++ b/src/caffe/ocl/im2col.cl @@ -0,0 +1,231 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
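MaxForward above computes an elementwise maximum over two inputs at a time and records in mask which bottom blob supplied the winning value; blob_idx identifies which pairwise pass is running, hence the special-casing of the very first pass (blob_idx == 0). Stock Caffe's eltwise MAX forward path folds the bottoms in one at a time, which is presumably how this kernel is driven here as well; a sketch of that calling pattern, with the host wrapper name assumed:

// First pass compares bottom[0] against bottom[1] (blob_idx = 0); each further
// bottom is then folded into the running maximum held in top_data.
MaxForward<Dtype>(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), 0, top_data, mask);
for (int i = 2; i < bottom.size(); ++i) {
  MaxForward<Dtype>(count, top_data, bottom[i]->gpu_data(), i - 1, top_data, mask);
}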
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ +template +__kernel void im2col_opt(const int n, __global T* data_im, const int channels, const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int height_col, const int width_col, __global T* data_col, const int col_offset, const int optnum) { + + int index = get_global_id(0); + + data_im = data_im + img_offset; + data_col = data_col + col_offset; + + int x_out = index % width_col; + int y_out = (index / width_col) % height_col; + int channel_in = (index / width_col / height_col) % channels; + int channel_out = channel_in * kernel_h * kernel_w; + int im_id = index / width_col / height_col / channels; + + int y_in = y_out * stride_h - pad_h; + int x_in = x_out * stride_w - pad_w; + int offset_col = channel_out * optnum * height_col * width_col + im_id * height_col * width_col; + int offset_im = im_id * channels * height * width + channel_in * height * width; + + for(int k_h = 0; k_h < kernel_h; k_h++) { + for(int k_w = 0; k_w < kernel_w; k_w++) { + int x_im = x_in + k_w; + int y_im = y_in + k_h; + int index_im = y_im * width + x_im; + int index_col = (k_h * kernel_w + k_w) * optnum * height_col * width_col + y_out * width_col + x_out; + if(y_im >= 0 && y_im < height && x_im >= 0 && x_im < width) + data_col[offset_col + index_col] = data_im[offset_im + index_im]; + else + data_col[offset_col + index_col] = 0; + } + } +} + +template __attribute__((mangled_name(im2col_opt_float))) __kernel void im2col_opt(const int n, __global float* data_im, const int channels, const int lmg_offset, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int height_col, const int width_col, __global float* data_col, const int col_offset, const int optnum); +template __attribute__((mangled_name(im2col_opt_double))) __kernel void im2col_opt(const int n, __global double* data_im, const int channels, const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int tride_h, const int stride_w, const int height_col, const int width_col, __global double* data_col, const int col_offset, const int optnum); + +template +__kernel void im2col(const int n, __global const T* data_im, const int img_offset, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int height_col, const int width_col, + __global T* data_col, const int col_offset) { + data_im = data_im + img_offset; + data_col = data_col + col_offset; + + int index = get_global_id(0); + if(index < n) { + int w_out = index % width_col; + int h_index = index / width_col; + int h_out = h_index % height_col; + int channel_in = h_index / 
height_col; + int channel_out = channel_in * kernel_h * kernel_w; + int h_in = h_out * stride_h - pad_h; + int w_in = w_out * stride_w - pad_w; + __global T* data_col_ptr = data_col; + data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out; + __global const T* data_im_ptr = data_im; + data_im_ptr += (channel_in * height + h_in) * width + w_in; + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + int h = h_in + i; + int w = w_in + j; + *data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ? + data_im_ptr[i * width + j] : 0; + data_col_ptr += height_col * width_col; + } + } + } +} + +template __attribute__((mangled_name(im2col_float))) void im2col(const int n, __global const float* data_im, + const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int height_col, const int width_col, __global float* data_col, const int col_offset); +template __attribute__((mangled_name(im2col_double))) void im2col(const int n, __global const double* data_im, + const int img_offset, const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int height_col, const int width_col, __global double* data_col, const int col_offset); + +template +__kernel void col2im(const int n, __global const T* data_col, const int col_offset, + const int height, const int width, const int channels, + const int patch_h, const int patch_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int height_col, const int width_col, + __global T* data_im, const int img_offset) { + data_col = data_col + col_offset; + data_im = data_im + img_offset; + int index = get_global_id(0); + if(index < n) { + T val = 0; + int w = index % width + pad_w; + int h = (index / width) % height + pad_h; + int c = index / (width * height); + // compute the start and end of the output + int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1; + int w_col_end = min(w / stride_w + 1, width_col); + int h_col_start = (h < patch_h) ? 
0 : (h - patch_h) / stride_h + 1; + int h_col_end = min(h / stride_h + 1, height_col); + // equivalent implementation + int offset = + (c * patch_h * patch_w + h * patch_w + w) * height_col * width_col; + int coeff_h_col = (1 - stride_h * patch_w * height_col) * width_col; + int coeff_w_col = (1 - stride_w * height_col * width_col); + for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { + for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { + val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col]; + } + } + data_im[index] = val; + } +} + +template __attribute__((mangled_name(col2im_float))) __kernel void col2im(const int n, __global const float* data_col, const int col_offset, + const int height, const int width, const int channels, + const int patch_h, const int patch_w,const int pad_h, const int pad_w, + const int stride_h, const int stride_w,const int height_col, const int width_col, + __global float* data_im, const int img_offset); +template __attribute__((mangled_name(col2im_double))) __kernel void col2im(const int n, __global const double* data_col, + const int col_offset, const int height, const int width, const int channels, + const int patch_h, const int patch_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, const int height_col, const int width_col, __global double* data_im, const int img_offset); + +template +__kernel void col2im_opt(const int n, __global T* data_col, const int col_offset, const int height, const int width, const int channels, +const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, const int height_col, const int width_col, __global T* data_im, const int img_offset, const int optnum) { + int index = get_global_id(0); + data_col = data_col + col_offset; + data_im = data_im + img_offset; + if(index < n) { + T val = 0; + int w = index % width + pad_w; + int h = (index / width) % height + pad_h; + int c = index / (width * height) % channels; + int im = index / width / height / channels; + // compute the start and end of the output + int w_col_start = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1; + int w_col_end = min(w / stride_w + 1, width_col); + int h_col_start = (h < kernel_h) ? 
0 : (h - kernel_h) / stride_h + 1; + int h_col_end = min(h / stride_h + 1, height_col); + // equivalent implementation + int offset = (c * kernel_h * kernel_w + h * kernel_w + w) * height_col * width_col * optnum + im * height_col * width_col; + int coeff_h_col = (1 - stride_h * kernel_w * height_col * optnum) * width_col; + int coeff_w_col = (1 - stride_w * height_col * width_col * optnum); + for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { + for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { + val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col]; + } + } + data_im[index] = val; + } +} +template __attribute__((mangled_name(col2im_opt_float))) __kernel void col2im_opt(const int n, __global float* data_col, const int col_offset, const int height, const int width, const int channels, const int patch_h, const int patch_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, const int height_col, const int width_col, __global float* data_im, const int img_offset, const int optnum); +template __attribute__((mangled_name(col2im_opt_double))) __kernel void col2im_opt(const int n, __global double* data_col, const int col_offset, const int height, const int width, const int channels, const int patch_h, const int patch_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w,const int height_col, const int width_col, __global double* data_im, const int img_offset, const int optnum); + +template +__kernel void opttrans(const int n, __global T* data_im, const int im_offset, const int height, const int width, const int channels, __global T* data_opt, const int opt_offset, const int optnum) { + + int index = get_global_id(0); + data_opt = data_opt + opt_offset; + data_im = data_im + im_offset; + if(index < n) { + int w = index % width; + int h = (index / width) % height; + int c = index / (width * height) % channels; + int im = index / width / height / channels; + + int opt_index = c * height * optnum * width + h * optnum * width + im * width + w; + data_opt[opt_index] = data_im[index]; + } +} +template __attribute__((mangled_name(opttrans_float))) __kernel void opttrans(const int n, __global float* data_im, const int im_offset, const int height, const int width, const int channels, __global float* data_opt, const int opt_offset, const int optnum); +template __attribute__((mangled_name(opttrans_double))) __kernel void opttrans(const int n, __global double* data_im, const int im_offset, const int height, const int width, const int channels, __global double* data_opt, const int opt_offset, const int optnum); + +template +__kernel void transpose(__global const T *src, __global T* dst, int width, int height, int optnum) { + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int gidyy = gidy; + int index = gidy / height; + int offset = index * width * height; + gidy = gidy % height; + if( gidx < width && gidyy < height * optnum ) + dst[offset + height * gidx + gidy] = src[offset + width * gidy + gidx]; +} +template __attribute__((mangled_name(transpose_float))) __kernel void transpose(__global const float* src, __global float* dst, const int width, const int height, int optnum); +template __attribute__((mangled_name(transpose_double))) __kernel void transpose(__global const double* src, __global double* dst, const int width, const int heighti, int optnum); + +template +__kernel void transform(__global const T *src, __global T* dst, int top_offset, int width, int height, int optnum) { + int gidx = get_global_id(0); + int 
index; + index = (optnum==1) ? 0: gidx % optnum; + dst = dst + top_offset; // now we point at (*top)[n] + int offset = gidx / optnum; + int i = 0; + for(i = 0; i < width; i++) + dst[(index * height + offset)* width + i] = src[gidx * width + i]; +} +template __attribute__((mangled_name(transform_float))) __kernel void transform(__global const float* src, __global float* dst, int top_offset, const int width, const int height, const int optnum); +template __attribute__((mangled_name(transform_double))) __kernel void transform(__global const double* src, __global double* dst, int top_offset, const int width, const int height, const int optnum); diff --git a/src/caffe/ocl/lrn_layer.cl b/src/caffe/ocl/lrn_layer.cl new file mode 100644 index 00000000..67eed4ae --- /dev/null +++ b/src/caffe/ocl/lrn_layer.cl @@ -0,0 +1,139 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ **************************************************************************************/ + +template +__kernel void LRNComputeOutput(const int nthreads, __global T* in, __global T* scale, const T negative_beta, __global T* out) { + int index = get_global_id(0); + int tmp = get_global_size(0); + for(index; index < nthreads; index += tmp) + out[index] = in[index] * pow(scale[index], negative_beta); +} +template __attribute__((mangled_name(LRNComputeOutput_float))) __kernel void LRNComputeOutput(const int nthreads, __global float* in, __global float* scale, const float negative_beta, __global float* out); +template __attribute__((mangled_name(LRNComputeOutput_double))) __kernel void LRNComputeOutput(const int nthreads, __global double* in, __global double* scale, const double negative_beta, __global double* out); + +template +__kernel void LRNFillScale(const int nthreads, __global T* in, const int num, const int channels, const int height, const int width, const int size, const T alpha_over_size, const T k, __global T* scale) { + int index = get_global_id(0); + int tmp = get_global_size(0); + for(index; index < nthreads; index += tmp) { + // find out the local offset + const int w = index % width; + const int h = (index / width) % height; + const int n = index / width / height; + const int offset = (n * channels * height + h) * width + w; + const int step = height * width; + in = in + offset; + scale = scale + offset; + int head = 0; + const int pre_pad = (size - 1) / 2; + const int post_pad = size - pre_pad - 1; + T accum_scale = 0; + // fill the scale at [n, :, h, w] + // accumulate values + while (head < post_pad && head < channels) { + accum_scale += in[head * step] * in[head * step]; + ++head; + } + // both add and subtract + while (head < channels) { + accum_scale += in[head * step] * in[head * step]; + if (head - size >= 0) { + accum_scale -= in[(head - size) * step] + * in[(head - size) * step]; + } + scale[(head - post_pad) * step] = k + accum_scale * alpha_over_size; + ++head; + } + // subtract only + while (head < channels + post_pad) { + if (head - size >= 0) { + accum_scale -= in[(head - size) * step] + * in[(head - size) * step]; + } + scale[(head - post_pad) * step] = k + accum_scale * alpha_over_size; + ++head; + } + } +} +template __attribute__((mangled_name(LRNFillScale_float))) __kernel void LRNFillScale (const int nthreads, __global float* in, const int num, const int channels, const int height, const int width, const int size, const float alpha_over_size, const float k, __global float* scale); +template __attribute__((mangled_name(LRNFillScale_double))) __kernel void LRNFillScale (const int nthreads, __global double* in, const int num, const int channels, const int height, const int width, const int size, const double alpha_over_size, const double k, __global double* scale); + +template +__kernel void LRNComputeDiff(const int nthreads, __global T* bottom_data, __global T* top_data, __global T* scale, __global T* top_diff, const int num, const int channels, const int height, const int width, const int size, const T negative_beta, const T cache_ratio, __global T* bottom_diff) { + int index = get_global_id(0); + int tmp = get_global_size(0); + for(index; index < nthreads; index += tmp) { + const int w = index % width; + const int h = (index / width) % height; + const int n = index / width / height; + const int offset = (n * channels * height + h) * width + w; + const int step = height * width; + bottom_data += offset; + top_data += offset; + scale += offset; + top_diff += 
offset; + bottom_diff += offset; + int head = 0; + const int pre_pad = size - (size + 1) / 2; + const int post_pad = size - pre_pad - 1; + T accum_ratio = 0; + // accumulate values + while (head < post_pad && head < channels) { + accum_ratio += top_diff[head * step] * top_data[head * step] / + scale[head * step]; + ++head; + } + // both add and subtract + while (head < channels) { + accum_ratio += top_diff[head * step] * top_data[head * step] / + scale[head * step]; + if (head - size >= 0) { + accum_ratio -= top_diff[(head - size) * step] * + top_data[(head - size) * step] / scale[(head - size) * step]; + } + bottom_diff[(head - post_pad) * step] = + top_diff[(head - post_pad) * step] + * pow(scale[(head - post_pad) * step], negative_beta) + - cache_ratio * bottom_data[(head - post_pad) * step] * accum_ratio; + ++head; + } + // subtract only + while (head < channels + post_pad) { + if (head - size >= 0) { + accum_ratio -= top_diff[(head - size) * step] * + top_data[(head - size) * step] / scale[(head - size) * step]; + } + bottom_diff[(head - post_pad) * step] = + top_diff[(head - post_pad) * step] + * pow(scale[(head - post_pad) * step], negative_beta) + - cache_ratio * bottom_data[(head - post_pad) * step] * accum_ratio; + ++head; + } + } +} + +template __attribute__((mangled_name(LRNComputeDiff_float))) __kernel void LRNComputeDiff(const int nthreads, __global float* bottom_data, __global float* top_data, __global float* scale, __global float* top_diff, const int num, const int channels, const int height, const int width, const int size, const float negative_beta, const float cache_ratio, __global float* bottom_diff); +template __attribute__((mangled_name(LRNComputeDiff_double))) __kernel void LRNComputeDiff(const int nthreads, __global double* bottom_data, __global double* top_data, __global double* scale, __global double* top_diff, const int num, const int channels, const int height, const int width, const int size, const double negative_beta, const double cache_ratio, __global double* bottom_diff); diff --git a/src/caffe/ocl/pooling_layer.cl b/src/caffe/ocl/pooling_layer.cl new file mode 100644 index 00000000..49a1413a --- /dev/null +++ b/src/caffe/ocl/pooling_layer.cl @@ -0,0 +1,293 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + +template +__kernel void MaxPoolForward(const int nthreads, __global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global T* top_data, __global int* mask, __global T* top_mask) { + int index = get_global_id(0); + int tmp = get_global_size(0); + for(index; index < nthreads; index += tmp) { + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + const int hend = min(hstart + kernel_h, height); + const int wend = min(wstart + kernel_w, width); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + T maxval = -FLT_MAX; + int maxidx = -1; + bottom_data = + bottom_data + (n * channels + c) * height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + if (bottom_data[h * width + w] > maxval) { + maxidx = h * width + w; + maxval = bottom_data[maxidx]; + } + } + } + top_data[index] = maxval; + if (mask) { + mask[index] = maxidx; + } else { + top_mask[index] = maxidx; + } + } +} +template __attribute__((mangled_name(MaxPoolForward_float))) __kernel void MaxPoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width,const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* top_data, __global int* mask, __global float* top_mask); +template __attribute__((mangled_name(MaxPoolForward_double))) __kernel void MaxPoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* top_data, __global int* mask, __global double* top_mask); + +template +__kernel void AvePoolForward(const int nthreads, __global T* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global T* top_data) { + int index = get_global_id(0); + int tmp = get_global_size(0); + for(index; index < nthreads; index+=tmp) { + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; int hstart = ph 
* stride_h - pad_h; int wstart = pw * stride_w - pad_w; + int hend = min(hstart + kernel_h, height + pad_h); + int wend = min(wstart + kernel_w, width + pad_w); + const int pool_size = (hend - hstart) * (wend - wstart); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, height); + wend = min(wend, width); + T aveval = 0; + bottom_data = + bottom_data + (n * channels + c) * height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + aveval += bottom_data[h * width + w]; + } + } + top_data[index] = aveval / pool_size; + } + +} +template __attribute__((mangled_name(AvePoolForward_float))) __kernel void AvePoolForward(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global float* top_data); +template __attribute__((mangled_name(AvePoolForward_double))) __kernel void AvePoolForward(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,__global double* top_data); + +template +__kernel void StoPoolForwardTrain(const int nthreads, __global T* bottom_data, const int clnum, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global T* rand_idx, __global T* top_data) { + int index = get_global_id(0); + int tmp = get_global_size(0); + for(index; index < nthreads; index+=tmp) { + const int pw = index % pooled_width; + const int ph = (index / pooled_width) % pooled_height; + const int c = (index / pooled_width / pooled_height) % channels; + const int n = index / pooled_width / pooled_height / channels; + const int hstart = ph * stride_h; + const int hend = min(hstart + kernel_h, height); + const int wstart = pw * stride_w; + const int wend = min(wstart + kernel_w, width); + T cumsum = 0.; + bottom_data = bottom_data + (n * channels + c) * height * width; + // First pass: get sum + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + cumsum += bottom_data[h * width + w]; + } + } + const float thres = rand_idx[index] * cumsum; + // Second pass: get value, and set index. 
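+        // thres is the caller-supplied random number rand_idx[index] scaled by the window sum,
+        // so the second pass below re-walks the window and selects the first element whose
+        // running sum reaches thres; when rand_idx is uniform in [0, 1) each input is therefore
+        // picked with probability proportional to its activation. The chosen element's flattened
+        // index is written back into rand_idx and its value into top_data.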
+ cumsum = 0; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + cumsum += bottom_data[h * width + w]; + if (cumsum >= thres) { + rand_idx[index] = ((n * channels + c) * height + h) * width + w; + top_data[index] = bottom_data[h * width + w]; + return; + } + } + } + } +} +template __attribute__((mangled_name(StoPoolForwardTrain_float))) __kernel void StoPoolForwardTrain(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global float* idx_data, __global float* top_data); +template __attribute__((mangled_name(StoPoolForwardTrain_double))) __kernel void StoPoolForwardTrain(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global double* idx_data, __global double* top_data); + +template +__kernel void StoPoolForwardTest(const int count, __global T* bottom_data, const int clnum, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global T* top_data) { + int index = get_global_id(0); + int tmp = get_global_size(0); + for(index; index < count; index+=tmp) { + const int pw = index % pooled_width; + const int ph = (index / pooled_width) % pooled_height; + const int c = (index / pooled_width / pooled_height) % channels; + const int n = index / pooled_width / pooled_height / channels; + const int hstart = ph * stride_h; + const int hend = min(hstart + kernel_h, height); + const int wstart = pw * stride_w; + const int wend = min(wstart + kernel_w, width); + // We set cumsum to be 0 to avoid divide-by-zero problems T cumsum = FLT_MIN; + T cumsum = FLT_MIN; + T cumvalues = 0.; + bottom_data = bottom_data + (n * channels + c) * height * width; + // First pass: get sum + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + cumsum += bottom_data[h * width + w]; + cumvalues += bottom_data[h * width + w] * bottom_data[h * width + w]; + } + } + top_data[index] = cumvalues / cumsum;} +} +template __attribute__((mangled_name(StoPoolForwardTest_float))) __kernel void StoPoolForwardTest(const int nthreads, __global float* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w,__global float* top_data); +template __attribute__((mangled_name(StoPoolForwardTest_double))) __kernel void StoPoolForwardTest(const int nthreads, __global double* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, __global double* top_data); + +template +__kernel void MaxPoolBackward(const int nthreads, __global T* top_diff, + __global int* mask, __global T* top_mask, const int num, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, const int pad_h, + const int pad_w, __global T* const bottom_diff) { + int index 
= get_global_id(0); + int total = get_global_size(0); + for(index; index < nthreads; index += total) { + // find out the local index + // find out the local offset + const int w = index % width; + const int h = (index / width) % height; + const int c = (index / width / height) % channels; + const int n = index / width / height / channels; + const int phstart = + (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1; + const int phend = min((h + pad_h) / stride_h + 1, pooled_height); + const int pwstart = + (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1; + const int pwend = min((w + pad_w) / stride_w + 1, pooled_width); + T gradient = 0; + const int offset = (n * channels + c) * pooled_height * pooled_width; + top_diff += offset; + if (mask) { + mask = mask + offset; + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + if (mask[ph * pooled_width + pw] == h * width + w) { + gradient += top_diff[ph * pooled_width + pw]; + } + } + } + } else { + top_mask = top_mask + offset; + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + if (top_mask[ph * pooled_width + pw] == h * width + w) { + gradient += top_diff[ph * pooled_width + pw]; + } + } + } + } + bottom_diff[index] = gradient; + } +} +template __attribute__((mangled_name(MaxPoolBackward_float))) __kernel void MaxPoolBackward(const int nthreads, __global float* const top_diff, __global int* const mask, __global float* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* bottom_diff); +template __attribute__((mangled_name(MaxPoolBackward_double))) __kernel void MaxPoolBackward(const int nthreads, __global double* top_diff, __global int* const mask, __global double* const top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* bottom_diff); + +template +__kernel void AvePoolBackward(const int nthreads, __global T* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global T* const bottom_diff) { + int index = get_global_id(0); + int total = get_global_size(0); + for(index; index < nthreads; index += total) { + int w = index % width + pad_w; + int h = (index / width) % height + pad_h; + int c = (index / width / height) % channels; + int n = index / width / height / channels; + const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; + const int phend = min(h / stride_h + 1, pooled_height); + const int pwstart = (w < kernel_w) ? 
0 : (w - kernel_w) / stride_w + 1; + const int pwend = min(w / stride_w + 1, pooled_width); + T gradient = 0; + top_diff += (n * channels + c) * pooled_height * pooled_width; + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + // figure out the pooling size + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = min(hstart + kernel_h, height + pad_h); + int wend = min(wstart + kernel_w, width + pad_w); + int pool_size = (hend - hstart) * (wend - wstart); + gradient += top_diff[ph * pooled_width + pw] / pool_size; + } + } + bottom_diff[index] = gradient; + } +} + +template __attribute__((mangled_name(AvePoolBackward_float))) __kernel void AvePoolBackward(const int nthreads, __global float* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global float* bottom_diff); +template __attribute__((mangled_name(AvePoolBackward_double))) __kernel void AvePoolBackward(const int nthreads, __global double* top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, __global double* bottom_diff); + +template +__kernel void StoPoolBackward(const int nthreads, + __global Dtype* rand_idx, __global Dtype* top_diff, + const int num, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const int kernel_h, const int kernel_w, const int stride_h, + const int stride_w, __global Dtype* bottom_diff) { + int index = get_global_id(0); + int total = get_global_size(0); + for(index; index < nthreads; index += total) { + // find out the local index + // find out the local offset + const int w = index % width; + const int h = (index / width) % height; + const int c = (index / width / height) % channels; + const int n = index / width / height / channels; + const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; + const int phend = min(h / stride_h + 1, pooled_height); + const int pwstart = (w < kernel_w) ? 
0 : (w - kernel_w) / stride_w + 1; + const int pwend = min(w / stride_w + 1, pooled_width); + Dtype gradient = 0; + rand_idx = + rand_idx + (n * channels + c) * pooled_height * pooled_width; + top_diff = + top_diff + (n * channels + c) * pooled_height * pooled_width; + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + gradient += top_diff[ph * pooled_width + pw] * + (index == static_cast(rand_idx[ph * pooled_width + pw])); + } + } + bottom_diff[index] = gradient; + + } +} +template __attribute__ ((mangled_name(StoPoolBackward_float))) __kernel void StoPoolBackward(const int nthreads, + __global float* rand_idx, __global float* top_diff, + const int num, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const int kernel_h, const int kernel_w, const int stride_h, + const int stride_w, __global float* bottom_diff); +template __attribute__ ((mangled_name(StoPoolBackward_double))) __kernel void StoPoolBackward(const int nthreads, + __global double* rand_idx, __global double* top_diff, + const int num, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const int kernel_h, const int kernel_w, const int stride_h, + const int stride_w, __global double* bottom_diff); diff --git a/src/caffe/ocl/prelu_layer.cl b/src/caffe/ocl/prelu_layer.cl new file mode 100644 index 00000000..caff18b9 --- /dev/null +++ b/src/caffe/ocl/prelu_layer.cl @@ -0,0 +1,60 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + +template +__kernel void PReLUForward(const int count, const int channels, const int dim, __global T* in, __global T* out, __global T* slope_data, const int div_factor) { + int index = get_global_id(0); + if(index < count) { + int c = (index / dim) % channels / div_factor; + out[index] = in[index] > 0 ? 
in[index] : in[index] * slope_data[c]; + } +} +template __attribute__ ((mangled_name(PReLUForward_float))) __kernel void PReLUForward(const int count, const int channels, const int dim, __global float* in, __global float* out, __global float* slope_data, const int div_factor); +template __attribute__ ((mangled_name(PReLUForward_double))) __kernel void PReLUForward(const int count, const int channels, const int dim, __global double* in, __global double* out, __global double* slope_data, const int div_factor); + +template +__kernel void PReLUBackward(const int count, const int channels, const int dim, __global T* in_diff, __global T* in_data, __global T* out_diff, __global T* slope_data, const int div_factor) { + int index = get_global_id(0); + if(index < count) { + int c = (index / dim) % channels / div_factor; + out_diff[index] = in_diff[index] * ((in_data[index] > 0) + + (in_data[index] <= 0) * slope_data[c]); + } +} +template __attribute__ ((mangled_name(PReLUBackward_float))) __kernel void PReLUBackward(const int count, const int channels, const int dim, __global float* in_diff, __global float* in_data, __global float* out_diff, __global float* slope_data, const int div_factor); +template __attribute__ ((mangled_name(PReLUBackward_double))) __kernel void PReLUBackward(const int count, const int channels, const int dim, __global double* in_diff, __global double* in_data, __global double* out_diff, __global double* slope_data, const int div_factor); + +template +__kernel void PReLUParamBackward(const int count, __global T* in_diff, const int offset_in_diff, __global T* in_data, const int offset_in_data, __global T* out_diff) { + int index = get_global_id(0); + if(index < count) { + in_diff += offset_in_diff; + in_data += offset_in_data; + out_diff[index] = in_diff[index] * in_data[index] * (in_data[index] <= 0); + } +} +template __attribute__ ((mangled_name(PReLUParamBackward_float))) __kernel void PReLUParamBackward(const int count, __global float* in_diff, const int offset_in_diff, __global float* in_data, const int offset_in_data, __global float* out_diff); +template __attribute__ ((mangled_name(PReLUParamBackward_double))) __kernel void PReLUParamBackward(const int count, __global double* in_diff, const int offset_in_diff, __global double* in_data, const int offset_in_data, __global double* out_diff); diff --git a/src/caffe/ocl/random.cl b/src/caffe/ocl/random.cl new file mode 100644 index 00000000..468240f0 --- /dev/null +++ b/src/caffe/ocl/random.cl @@ -0,0 +1,960 @@ + +#pragma OPENCL EXTENSION cl_amd_printf : enable + +//Note: the random generator has two parts +//first part: the open sourced threefry random generator kernel from DE Shaw Research +//second part: we wrap the kernel up to generate uniform, bernoulli and gaussian distribution generators.
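+//How the two parts fit together: threefry4x32 is a counter-based generator, so for a given
+//4x32-bit counter and key it deterministically returns four 32-bit pseudorandom words; the
+//kernels below derive the key from each work-item's global id, which lets every work-item
+//produce its own stream without any shared RNG state.
+//The wrapper kernels then map the raw words onto the requested distribution, e.g. the uniform
+//wrapper scales v / UINT_MAX into the range [inf, sup] and the bernoulli wrapper compares that
+//uniform value against threshold (with inf = 0, sup = 1, threshold = 0.5 the word 0x80000000
+//maps to ~0.5 and yields a 1).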
+ +//begin: the open sourced random generator from DE Shaw Research +//https://www.deshawresearch.com/resources_random123.html +typedef uint uint32_t; + +struct r123array4x32 { + uint32_t v[4]; +}; + +enum r123_enum_threefry32x4 { + R_32x4_0_0 = 10, + R_32x4_0_1 = 26, + R_32x4_1_0 = 11, + R_32x4_1_1 = 21, + R_32x4_2_0 = 13, + R_32x4_2_1 = 27, + R_32x4_3_0 = 23, + R_32x4_3_1 = 5, + R_32x4_4_0 = 6, + R_32x4_4_1 = 20, + R_32x4_5_0 = 17, + R_32x4_5_1 = 11, + R_32x4_6_0 = 25, + R_32x4_6_1 = 10, + R_32x4_7_0 = 18, + R_32x4_7_1 = 20 +}; + +inline uint32_t RotL_32(uint32_t x, unsigned int N) + __attribute__((always_inline)); +inline uint32_t RotL_32(uint32_t x, unsigned int N) { + return (x << (N & 31)) | (x >> ((32 - N) & 31)); +} + +typedef struct r123array4x32 threefry4x32_ctr_t; +typedef struct r123array4x32 threefry4x32_key_t; +typedef struct r123array4x32 threefry4x32_ukey_t; + +inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, + threefry4x32_ctr_t in, threefry4x32_key_t k) __attribute__((always_inline)); +inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, + threefry4x32_ctr_t in, threefry4x32_key_t k) { + threefry4x32_ctr_t X; + uint32_t ks[4 + 1]; + int i; + ks[4] = 0x1BD11BDA; + /* + for (i = 0; i < 4; i++) + { + ks[i] = k.v[i]; + X.v[i] = in.v[i]; + ks[4] ^= k.v[i]; + }*/ + { + ks[0] = k.v[0]; + X.v[0] = in.v[0]; + ks[4] ^= k.v[0]; + + ks[1] = k.v[1]; + X.v[1] = in.v[1]; + ks[4] ^= k.v[1]; + + ks[2] = k.v[2]; + X.v[2] = in.v[2]; + ks[4] ^= k.v[2]; + + ks[3] = k.v[3]; + X.v[3] = in.v[3]; + ks[4] ^= k.v[3]; + } + X.v[0] += ks[0]; + X.v[1] += ks[1]; + X.v[2] += ks[2]; + X.v[3] += ks[3]; + if (Nrounds > 0) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 1) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 2) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 3) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 3) { + X.v[0] += ks[1]; + X.v[1] += ks[2]; + X.v[2] += ks[3]; + X.v[3] += ks[4]; + X.v[4 - 1] += 1; + } + if (Nrounds > 4) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 5) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 6) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 7) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 7) { + X.v[0] += ks[2]; + X.v[1] += ks[3]; + X.v[2] += ks[4]; + X.v[3] += ks[0]; + X.v[4 - 1] += 2; + } + if (Nrounds > 8) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } + if 
(Nrounds > 9) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 10) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 11) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 11) { + X.v[0] += ks[3]; + X.v[1] += ks[4]; + X.v[2] += ks[0]; + X.v[3] += ks[1]; + X.v[4 - 1] += 3; + } + if (Nrounds > 12) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 13) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 14) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 15) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 15) { + X.v[0] += ks[4]; + X.v[1] += ks[0]; + X.v[2] += ks[1]; + X.v[3] += ks[2]; + X.v[4 - 1] += 4; + } + if (Nrounds > 16) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 17) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 18) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 19) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 19) { + X.v[0] += ks[0]; + X.v[1] += ks[1]; + X.v[2] += ks[2]; + X.v[3] += ks[3]; + X.v[4 - 1] += 5; + } + if (Nrounds > 20) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 21) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 22) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 23) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 23) { + X.v[0] += ks[1]; + X.v[1] += ks[2]; + X.v[2] += ks[3]; + X.v[3] += ks[4]; + X.v[4 - 1] += 6; + } + if (Nrounds > 24) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 25) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= 
X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 26) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 27) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 27) { + X.v[0] += ks[2]; + X.v[1] += ks[3]; + X.v[2] += ks[4]; + X.v[3] += ks[0]; + X.v[4 - 1] += 7; + } + if (Nrounds > 28) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 29) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 30) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 31) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 31) { + X.v[0] += ks[3]; + X.v[1] += ks[4]; + X.v[2] += ks[0]; + X.v[3] += ks[1]; + X.v[4 - 1] += 8; + } + if (Nrounds > 32) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 33) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 34) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 35) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 35) { + X.v[0] += ks[4]; + X.v[1] += ks[0]; + X.v[2] += ks[1]; + X.v[3] += ks[2]; + X.v[4 - 1] += 9; + } + if (Nrounds > 36) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 37) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 38) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 39) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 39) { + X.v[0] += ks[0]; + X.v[1] += ks[1]; + X.v[2] += ks[2]; + X.v[3] += ks[3]; + X.v[4 - 1] += 10; + } + if (Nrounds > 40) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 41) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } + 
if (Nrounds > 42) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 43) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 43) { + X.v[0] += ks[1]; + X.v[1] += ks[2]; + X.v[2] += ks[3]; + X.v[3] += ks[4]; + X.v[4 - 1] += 11; + } + if (Nrounds > 44) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 45) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 46) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 47) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 47) { + X.v[0] += ks[2]; + X.v[1] += ks[3]; + X.v[2] += ks[4]; + X.v[3] += ks[0]; + X.v[4 - 1] += 12; + } + if (Nrounds > 48) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 49) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 50) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 51) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 51) { + X.v[0] += ks[3]; + X.v[1] += ks[4]; + X.v[2] += ks[0]; + X.v[3] += ks[1]; + X.v[4 - 1] += 13; + } + if (Nrounds > 52) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 53) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 54) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 55) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 55) { + X.v[0] += ks[4]; + X.v[1] += ks[0]; + X.v[2] += ks[1]; + X.v[3] += ks[2]; + X.v[4 - 1] += 14; + } + if (Nrounds > 56) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 57) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 58) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] 
^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 59) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 59) { + X.v[0] += ks[0]; + X.v[1] += ks[1]; + X.v[2] += ks[2]; + X.v[3] += ks[3]; + X.v[4 - 1] += 15; + } + if (Nrounds > 60) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 61) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 62) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 63) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 63) { + X.v[0] += ks[1]; + X.v[1] += ks[2]; + X.v[2] += ks[3]; + X.v[3] += ks[4]; + X.v[4 - 1] += 16; + } + if (Nrounds > 64) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 65) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 66) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 67) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 67) { + X.v[0] += ks[2]; + X.v[1] += ks[3]; + X.v[2] += ks[4]; + X.v[3] += ks[0]; + X.v[4 - 1] += 17; + } + if (Nrounds > 68) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 69) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 70) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } + if (Nrounds > 71) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 71) { + X.v[0] += ks[3]; + X.v[1] += ks[4]; + X.v[2] += ks[0]; + X.v[3] += ks[1]; + X.v[4 - 1] += 18; + } + return X; +} +//end: the open sourced random generator from DE Shaw Research + +template +__kernel void PRNG_threefry4x32_bernoulli( + __global uint4 *randomnumber, + threefry4x32_ctr_t ctr_i, + T inf, + T sup, + T threshold, + uint nrounds, + uint numrandom +) { + size_t gdx = get_global_id(0); + + uint maxUint = 0; + maxUint--; + float r = (float)maxUint; + + threefry4x32_ctr_t ctr = ctr_i; + threefry4x32_ukey_t ukey; + + ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx; + + threefry4x32_ctr_t random4; + + if ( gdx < numrandom ) + { + 
random4 = threefry4x32_R(nrounds, ctr, ukey); + uint4 frnd; + + frnd.x = ( (((float)random4.v[0]) / r) * (sup - inf) + inf ) > threshold? 1 : 0; + frnd.y = ( (((float)random4.v[1]) / r) * (sup - inf) + inf ) > threshold? 1 : 0; + frnd.z = ( (((float)random4.v[2]) / r) * (sup - inf) + inf ) > threshold? 1 : 0; + frnd.w = ( (((float)random4.v[3]) / r) * (sup - inf) + inf ) > threshold? 1 : 0; + + randomnumber[gdx] = frnd; + } +} + +template __attribute__((mangled_name(RNGBernoulli_float))) __kernel void PRNG_threefry4x32_bernoulli(__global uint4 *randomnumber, threefry4x32_ctr_t ctr_i, float inf, float sup, float threshold, uint nrounds, uint numrandom); + +template __attribute__((mangled_name(RNGBernoulli_double))) __kernel void PRNG_threefry4x32_bernoulli(__global uint4 *randomnumber, threefry4x32_ctr_t ctr_i, double inf, double sup, double threshold, uint nrounds, uint numrandom); + +//end of the looooooong gpu_random_generator kernel + +//We wrap the kernel up to generate uniform, bernoulli and gaussian distribution generators. + +template +__kernel void PRNG_threefry4x32_uniform( + __global float4 *randomnumber, + threefry4x32_ctr_t ctr_i, + T inf, + T sup, + uint nrounds, + uint numrandom +) { + size_t gdx = get_global_id(0); + + uint maxUint = 0; + maxUint--; + float r = (float)maxUint; + + threefry4x32_ctr_t ctr = ctr_i; + threefry4x32_ukey_t ukey; + + ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx; + + threefry4x32_ctr_t random4; + + if ( gdx < numrandom ) + { + random4 = threefry4x32_R(nrounds, ctr, ukey); + float4 frnd; + frnd.x = ( (((float)random4.v[0]) / r) * (sup - inf) + inf ); + frnd.y = ( (((float)random4.v[1]) / r) * (sup - inf) + inf ); + frnd.z = ( (((float)random4.v[2]) / r) * (sup - inf) + inf ); + frnd.w = ( (((float)random4.v[3]) / r) * (sup - inf) + inf ); + randomnumber[gdx] = frnd; + } +} + +template __attribute__((mangled_name(RNGUniform_float))) __kernel void PRNG_threefry4x32_uniform(__global float4 *randomnumber, threefry4x32_ctr_t ctr_i, float inf, float sup, uint nrounds, uint numrandom); + +template __attribute__((mangled_name(RNGUniform_double))) __kernel void PRNG_threefry4x32_uniform(__global float4 *randomnumber, threefry4x32_ctr_t ctr_i, double inf, double sup, uint nrounds, uint numrandom); + +__kernel void PRNG_threefry4x32_uint_uniform( + __global uint4 *randomnumber, + threefry4x32_ctr_t ctr_i, + uint inf, + uint sup, + uint nrounds, + uint numrandom +) { + size_t gdx = get_global_id(0); + + threefry4x32_ctr_t ctr = ctr_i; + threefry4x32_ukey_t ukey; + + ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx; + + threefry4x32_ctr_t random4; + + if ( gdx < numrandom ) + { + random4 = threefry4x32_R(nrounds, ctr, ukey); + uint4 frnd; + frnd.x = random4.v[0] % (sup - inf) + inf; + frnd.y = random4.v[1] % (sup - inf) + inf; + frnd.z = random4.v[2] % (sup - inf) + inf; + frnd.w = random4.v[3] % (sup - inf) + inf; + randomnumber[gdx] = frnd; + } +} + +template +__kernel void PRNG_threefry4x32_gaussian( + __global float4 *randomnumber, + threefry4x32_ctr_t ctr_i, + T E, + T V, + uint nrounds, + uint numrandom +) { + size_t gdx = get_global_id(0); + + uint maxUint = 0; + maxUint--; + float r = (float)maxUint; + + threefry4x32_ctr_t ctr = ctr_i; + threefry4x32_ukey_t ukey1, ukey2; + + ukey1.v[0] = ukey2.v[1] = ukey1.v[2] = ukey2.v[3] = gdx; + ukey2.v[0] = ukey1.v[1] = ukey2.v[2] = ukey1.v[3] = 0; + + threefry4x32_ctr_t random1, random2; + + if ( gdx < numrandom ) + { + random1 = threefry4x32_R(nrounds, ctr, ukey1); + random2 = threefry4x32_R(nrounds,
ctr, ukey2); + float4 frnd1; + + float r1 = (((float)random1.v[0]) / r); // generate a random sequence of uniform distribution + float r2 = (((float)random2.v[0]) / r); + float r3 = (((float)random1.v[1]) / r); + float r4 = (((float)random2.v[1]) / r); + float r5 = (((float)random1.v[2]) / r); + float r6 = (((float)random2.v[2]) / r); + float r7 = (((float)random1.v[3]) / r); + float r8 = (((float)random2.v[3]) / r); + + if(r2 == 0 || r4 == 0 || r6 == 0 || r8 == 0) { + r2 += 0.0001; + r4 += 0.0001; + r6 += 0.0001; + r8 += 0.0001; + } + + frnd1.x = cos(2*M_PI*r1)*sqrt(-2.0*log(r2)) * V + E; // return a pseudo sequence of normal distribution using two above uniform noise data + //frnd2.x = sin(2*M_PI*r1)*sqrt(-2.0*log(r2)); // return the quadrature counterpart of the foregoing pseudo normal distribution sequence + frnd1.y = cos(2*M_PI*r3)*sqrt(-2.0*log(r4)) * V + E;// return a pseudo sequence of normal distribution using two above uniform noise data + //frnd2.y = sin(2*M_PI*r3)*sqrt(-2.0*log(r4)); // return the quadrature counterpart of the foregoing pseudo normal distribution sequence + frnd1.z = cos(2*M_PI*r5)*sqrt(-2.0*log(r6)) * V + E;// return a pseudo sequence of normal distribution using two above uniform noise data + //frnd2.z = sin(2*M_PI*r5)*sqrt(-2.0*log(r6)); // return the quadrature counterpart of the foregoing pseudo normal distribution sequence + frnd1.w = cos(2*M_PI*r7)*sqrt(-2.0*log(r8)) * V + E;// return a pseudo sequence of normal distribution using two above uniform noise data + //frnd2.w = sin(2*M_PI*r7)*sqrt(-2.0*log(r8)); // return the quadrature counterpart of the foregoing pseudo normal distribution sequence + + randomnumber[gdx] = frnd1; + } +} + +template __attribute__((mangled_name(RNGGaussian_float))) __kernel void PRNG_threefry4x32_gaussian(__global float4 *randomnumber, threefry4x32_ctr_t ctr_i, float E, float V, uint nrounds, uint numrandonm); + +template __attribute__((mangled_name(RNGGaussian_double))) __kernel void PRNG_threefry4x32_gaussian(__global float4 *randomnumber, threefry4x32_ctr_t ctr_i, double E, double V, uint nrounds, uint numrandonm); + diff --git a/src/caffe/ocl/relu_layer.cl b/src/caffe/ocl/relu_layer.cl new file mode 100644 index 00000000..e39aa426 --- /dev/null +++ b/src/caffe/ocl/relu_layer.cl @@ -0,0 +1,46 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + +template +__kernel void ReLUForward(const int count, __global T* in, __global T* out, T negative_slope) { + int index = get_global_id(0); + if(index < count) + out[index] = in[index] > 0? in[index]:in[index]*negative_slope; +} + +template __attribute__ ((mangled_name(ReLUForward_float))) __kernel void ReLUForward(const int count, __global float* in, __global float* out, float negative_slope); +template __attribute__ ((mangled_name(ReLUForward_double))) __kernel void ReLUForward(const int count, __global double* in, __global double* out, double negative_slope); + +template +__kernel void ReLUBackward(const int count, __global T* in_diff, __global T* in_data,__global T* out_diff,T negative_slope) { + int index = get_global_id(0); + if(index < count) { + out_diff[index] = in_diff[index] * ((in_data[index] > 0) + (in_data[index] <= 0) * negative_slope); + } +} + +template __attribute__ ((mangled_name(ReLUBackward_float))) __kernel void ReLUBackward(const int count, __global float* in_diff, __global float* in_data, __global float* out_diff, float negative_slope); +template __attribute__ ((mangled_name(ReLUBackward_double))) __kernel void ReLUBackward(const int count, __global double* in_diff, __global double* in_data, __global double* out_diff, double negative_slope); diff --git a/src/caffe/ocl/sigmoid_layer.cl b/src/caffe/ocl/sigmoid_layer.cl new file mode 100644 index 00000000..ac0ef9a9 --- /dev/null +++ b/src/caffe/ocl/sigmoid_layer.cl @@ -0,0 +1,46 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + +template +__kernel void SigmoidForward(const int count, __global T* in, __global T* out) { + int index = get_global_id(0); + if(index < count) + out[index] = 1. / (1. + exp(-in[index])); +} + +template __attribute__ ((mangled_name(SigmoidForward_float))) __kernel void SigmoidForward(const int count, __global float* in, __global float* out); +template __attribute__ ((mangled_name(SigmoidForward_double))) __kernel void SigmoidForward(const int count, __global double* in, __global double* out); + +template +__kernel void SigmoidBackward(const int count, __global T* in_diff, __global T* out_data,__global T* out_diff) { + int index = get_global_id(0); + const T sigmoid_x = out_data[index]; + if(index < count) + out_diff[index] = in_diff[index] * sigmoid_x * (1 - sigmoid_x); +} + +template __attribute__ ((mangled_name(SigmoidBackward_float))) __kernel void SigmoidBackward(const int count, __global float* in_diff, __global float* out_data, __global float* out_diff); +template __attribute__ ((mangled_name(SigmoidBackward_double))) __kernel void SigmoidBackward(const int count, __global double* in_diff, __global double* out_data, __global double* out_diff); diff --git a/src/caffe/ocl/slice_layer.cl b/src/caffe/ocl/slice_layer.cl new file mode 100644 index 00000000..26c6bb34 --- /dev/null +++ b/src/caffe/ocl/slice_layer.cl @@ -0,0 +1,28 @@ +template +__kernel void Slice(const int nthreads, __global const Dtype* in_data, + const int forward, const int num_slices, const int slice_size, + const int bottom_slice_axis, const int top_slice_axis, + const int offset_slice_axis, __global Dtype* out_data) { + int index = get_global_id(0); + if (index < nthreads) { + const int total_slice_size = slice_size * top_slice_axis; + const int slice_num = index / total_slice_size; + const int slice_index = index % total_slice_size; + const int bottom_index = slice_index + + (slice_num * bottom_slice_axis + offset_slice_axis) * slice_size; + if (forward) { + out_data[index] = in_data[bottom_index]; + } else { + out_data[bottom_index] = in_data[index]; + } + } +} + +template __attribute__ ((mangled_name(Slice_float))) __kernel void Slice(const int nthreads, __global const float* in_data, + const int forward, const int num_slices, const int slice_size, + const int bottom_slice_axis, const int top_slice_axis, + const int offset_slice_axis, __global float* out_data); +template __attribute__ ((mangled_name(Slice_double))) __kernel void Slice(const int nthreads, __global const double* in_data, + const int forward, const int num_slices, const int slice_size, + const int bottom_slice_axis, const int top_slice_axis, + const int offset_slice_axis, __global double* out_data); diff --git a/src/caffe/ocl/softmax_layer.cl b/src/caffe/ocl/softmax_layer.cl new file mode 100644 index 00000000..207f0058 --- /dev/null +++ b/src/caffe/ocl/softmax_layer.cl @@ -0,0 +1,171 @@ 
+/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + +template +__kernel void softmax(__global T* prob_data, __global T* loss, __global T* label, int num, int dim, __local T* resultScratch) { + + int gid = get_global_id(0); + int size = get_global_size(0); + + resultScratch[gid] = 0.0; + for(int i = gid; i < num; i += size) { + resultScratch[gid] += -log(prob_data[i * dim + static_cast(label[i])]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(gid < 128) + resultScratch[gid] += resultScratch[gid + 128]; + barrier(CLK_LOCAL_MEM_FENCE); + if(gid < 64) + resultScratch[gid] += resultScratch[gid + 64]; + if(gid < 32) + resultScratch[gid] += resultScratch[gid + 32]; + if(gid < 16) + resultScratch[gid] += resultScratch[gid + 16]; + if(gid < 8) + resultScratch[gid] += resultScratch[gid + 8]; + if(gid < 4) + resultScratch[gid] += resultScratch[gid + 4]; + if(gid < 2) + resultScratch[gid] += resultScratch[gid + 2]; + if(gid < 1) { + resultScratch[gid] += resultScratch[gid + 1]; + loss[0] = resultScratch[gid]; + } +} +template __attribute__ ((mangled_name(softmax_float))) __kernel void softmax (__global float* prob_data, __global float* loss, __global float* label, int num, int dim, __local float* resultScratch); +template __attribute__ ((mangled_name(softmax_double))) __kernel void softmax (__global double* prob_data, __global double* loss, __global double* label, int num, int dim, __local double* resultScratch); + +template +__kernel void softmax_div (const int num, const int dim, __global T* scale, __global T* data) { + //printf("softmax_div\n"); + int index = get_global_id(0); + int total = get_global_size(0); + for(index; index < num*dim; index += total) { + int n = index / dim; + data[index] /= scale[n]; + } +} + +template __attribute__ ((mangled_name(softmax_div_float))) __kernel void softmax_div (const int num, const int dim, __global float* scale, __global float* data); +template __attribute__ ((mangled_name(softmax_div_double))) __kernel void softmax_div (const int num, const int dim, __global double* 
scale, __global double* data); + +template +__kernel void kernel_channel_max(const int num, const int channels, + const int spatial_dim, __global const T* data, __global T* out) { + int index = get_global_id(0); + if(index < num * spatial_dim) { + int n = index / spatial_dim; + int s = index % spatial_dim; + T maxval = -FLT_MAX; + for (int c = 0; c < channels; ++c) { + maxval = max(data[(n * channels + c) * spatial_dim + s], maxval); + } + out[index] = maxval; + } +} + +template __attribute__ ((mangled_name(kernel_channel_max_float))) __kernel void kernel_channel_max(const int num, const int channels, + const int spatial_dim, __global const float* data, __global float* out); +template __attribute__ ((mangled_name(kernel_channel_max_double))) __kernel void kernel_channel_max(const int num, const int channels, + const int spatial_dim, __global const double* data, __global double* out); + +template +__kernel void kernel_channel_subtract(const int count, + const int num, const int channels, + const int spatial_dim, __global const T* channel_max, __global T* data) { + int index = get_global_id(0); + if(index < count) { + int n = index / channels / spatial_dim; + int s = index % spatial_dim; + data[index] -= channel_max[n * spatial_dim + s]; + } +} +template __attribute__ ((mangled_name(kernel_channel_subtract_float))) __kernel void kernel_channel_subtract(const int count, const int num, const int channels, const int spatial_dim, __global const float* channel_max, __global float* data); +template __attribute__ ((mangled_name(kernel_channel_subtract_double))) __kernel void kernel_channel_subtract(const int count, const int num, const int channels, const int spatial_dim, __global const double* channel_max, __global double* data); + +template +__kernel void kernel_channel_sum(const int num, const int channels, + const int spatial_dim, __global const T* data, __global T* channel_sum) { + int index = get_global_id(0); + if(index < num * spatial_dim) { + int n = index / spatial_dim; + int s = index % spatial_dim; + T sum = 0; + for (int c = 0; c < channels; ++c) { + sum += data[(n * channels + c) * spatial_dim + s]; + } + channel_sum[index] = sum; + } +} + +template __attribute__ ((mangled_name(kernel_channel_sum_float))) __kernel void kernel_channel_sum(const int num, const int channels, + const int spatial_dim, __global const float* data, __global float* channel_sum); +template __attribute__ ((mangled_name(kernel_channel_sum_double))) __kernel void kernel_channel_sum(const int num, const int channels, + const int spatial_dim, __global const double* data, __global double* channel_sum); + +template +__kernel void kernel_channel_div(const int count, + const int num, const int channels, + const int spatial_dim, __global const T* channel_sum, __global T* data) { + int index = get_global_id(0); + if(index < count) { + int n = index / channels / spatial_dim; + int s = index % spatial_dim; + data[index] /= channel_sum[n * spatial_dim + s]; + } +} + +template __attribute__ ((mangled_name(kernel_channel_div_float))) __kernel void kernel_channel_div(const int count, + const int num, const int channels, + const int spatial_dim, __global const float* channel_sum, __global float* data); +template __attribute__ ((mangled_name(kernel_channel_div_double))) __kernel void kernel_channel_div(const int count, + const int num, const int channels, + const int spatial_dim, __global const double* channel_sum, __global double* data); + +template +__kernel void kernel_channel_dot(const int num, const int channels, + const int 
spatial_dim, __global const T* data_1, __global const T* data_2, + __global T* channel_dot) { + int index = get_global_id(0); + if(index < num * spatial_dim) { + int n = index / spatial_dim; + int s = index % spatial_dim; + T dot = 0; + for (int c = 0; c < channels; ++c) { + dot += (data_1[(n * channels + c) * spatial_dim + s] + * data_2[(n * channels + c) * spatial_dim + s]); + } + channel_dot[index] = dot; + } +} + +template __attribute__ ((mangled_name(kernel_channel_dot_float))) __kernel void kernel_channel_dot(const int num, const int channels, + const int spatial_dim, __global const float* data_1, __global const float* data_2, + __global float* channel_dot); +template __attribute__ ((mangled_name(kernel_channel_dot_double))) __kernel void kernel_channel_dot(const int num, const int channels, + const int spatial_dim, __global const double* data_1, __global const double* data_2, + __global double* channel_dot); diff --git a/src/caffe/ocl/softmaxwithloss_layer.cl b/src/caffe/ocl/softmaxwithloss_layer.cl new file mode 100644 index 00000000..731f660c --- /dev/null +++ b/src/caffe/ocl/softmaxwithloss_layer.cl @@ -0,0 +1,103 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ **************************************************************************************/ + +template +__kernel void SoftmaxLossForwardGPU(const int nthreads, + __global T* prob_data, __global T* label,__global T* loss, + int num, int dim, int spatial_dim, + bool has_ignore_label_, int ignore_label_, + __global T* counts) { + int index = get_global_id(0); + if(index < nthreads) { + const int n = index / spatial_dim; + const int s = index % spatial_dim; + const int label_value = static_cast(label[n * spatial_dim + s]); + if (has_ignore_label_ && label_value == ignore_label_) { + loss[index] = 0; + counts[index] = 0; + } else { + loss[index] = -log(max(prob_data[n * dim + label_value * spatial_dim + s], + T(FLT_MIN))); + counts[index] = 1; + } + } +} + +template __attribute__ ((mangled_name(SoftmaxLossForwardGPU_float))) __kernel void SoftmaxLossForwardGPU(int nthreads, + __global float* prob_data, __global float* label,__global float* loss, + int num, int dim, int spatial_dim, + bool has_ignore_label_, int ignore_label_, + __global float* counts); +template __attribute__ ((mangled_name(SoftmaxLossForwardGPU_double))) __kernel void SoftmaxLossForwardGPU(int nthreads, + __global double* prob_data, __global double* label,__global double* loss, + int num, int dim, int spatial_dim, + bool has_ignore_label_, int ignore_label_, + __global double* counts); + +template +__kernel void SoftmaxLossBackwardGPU(int nthreads, __global T* top, + __global T* label,__global T* bottom_diff, int num, int dim, + int spatial_dim, bool has_ignore_label_, + int ignore_label_, T* counts) { + const int channels = dim / spatial_dim; + int index = get_global_id(0); + if(index < nthreads) { + const int n = index / spatial_dim; + const int s = index % spatial_dim; + const int label_value = static_cast(label[n * spatial_dim + s]); + + if (has_ignore_label_ && label_value == ignore_label_) { + for (int c = 0; c < channels; ++c) { + bottom_diff[n * dim + c * spatial_dim + s] = 0; + } + counts[index] = 0; + } else { + bottom_diff[n * dim + label_value * spatial_dim + s] -= 1; + counts[index] = 1; + } + } +} +template __attribute__ ((mangled_name(SoftmaxLossBackwardGPU_float))) __kernel void SoftmaxLossBackwardGPU(int nthreads, __global float* top, + __global float* label,__global float* bottom_diff, int num, int dim, + int spatial_dim, bool has_ignore_label_, + int ignore_label_, float* counts); + +template __attribute__ ((mangled_name(SoftmaxLossBackwardGPU_double))) __kernel void SoftmaxLossBackwardGPU(int nthreads, __global double* top, + __global double* label,__global double* bottom_diff, int num, int dim, + int spatial_dim, bool has_ignore_label_, + int ignore_label_, double* counts); + +template +__kernel void scal (const int num, const T alpha, __global T* data) { + int index = get_global_id(0); + int total = get_global_size(0); + for(index; index < num; index += total) { + data[index] = data[index] * alpha; + } +} + +template __attribute__ ((mangled_name(scal_float))) __kernel void scal (const int num, const float alpha, __global float* data); +template __attribute__ ((mangled_name(scal_double))) __kernel void scal (const int num, const double alpha, __global double* data); diff --git a/src/caffe/ocl/tanh_layer.cl b/src/caffe/ocl/tanh_layer.cl new file mode 100644 index 00000000..900f11ea --- /dev/null +++ b/src/caffe/ocl/tanh_layer.cl @@ -0,0 +1,46 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + +template +__kernel void TanHForward(const int count, __global T* in, __global T* out) { + int index = get_global_id(0); + if(index < count) + out[index] = tanh(in[index]); +} + +template __attribute__ ((mangled_name(TanHForward_float))) __kernel void TanHForward(const int count, __global float* in, __global float* out); +template __attribute__ ((mangled_name(TanHForward_double))) __kernel void TanHForward(const int count, __global double* in, __global double* out); + +template +__kernel void TanHBackward(const int count, __global T* in_diff, __global T* out_data,__global T* out_diff) { + int index = get_global_id(0); + if(index < count) { + const T tanhx = out_data[index]; + out_diff[index] = in_diff[index] * (1 - tanhx * tanhx); + } +} + +template __attribute__ ((mangled_name(TanHBackward_float))) __kernel void TanHBackward(const int count, __global float* in_diff, __global float* out_data, __global float* out_diff); +template __attribute__ ((mangled_name(TanHBackward_double))) __kernel void TanHBackward(const int count, __global double* in_diff, __global double* out_data, __global double* out_diff); diff --git a/src/caffe/ocl/threshold_layer.cl b/src/caffe/ocl/threshold_layer.cl new file mode 100644 index 00000000..679dbf29 --- /dev/null +++ b/src/caffe/ocl/threshold_layer.cl @@ -0,0 +1,36 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + +template +__kernel void ThresholdForward(const int count, const T threshold, __global T* in, __global T* out) { + int index = get_global_id(0); + if(index < count) + out[index] =in[index] > threshold ? 1 : 0; +} + +template __attribute__ ((mangled_name(ThresholdForward_float))) __kernel void ThresholdForward(const int count, const float threshold, __global float* in, __global float* out); +template __attribute__ ((mangled_name(ThresholdForward_double))) __kernel void ThresholdForward(const int count, const double threshold, __global double* in, __global double* out); + diff --git a/src/caffe/ocl/util.cl b/src/caffe/ocl/util.cl new file mode 100644 index 00000000..222e4ed9 --- /dev/null +++ b/src/caffe/ocl/util.cl @@ -0,0 +1,268 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ **************************************************************************************/ + +#pragma OPENCL EXTENSION cl_amd_printf : enable + +template +__kernel void OCL_memset(__global T* buffer, const T value, const int size, const int buf_offset) { + int gdx = get_global_id(0); + buffer += buf_offset; + if(gdx < size) { + buffer[gdx] = value; + } +} + +template __attribute__((mangled_name(oclmem_int))) __kernel void OCL_memset(__global int* buffer, const int value, const int size, const int buf_offset); +template __attribute__((mangled_name(oclmem_float))) __kernel void OCL_memset(__global float* buffer, const float value, const int size, const int buf_offset); +template __attribute__((mangled_name(oclmem_double))) __kernel void OCL_memset(__global double* buffer, const double value, const int size, const int buf_offset); + +__kernel void OCL_memset2(__global int* buffer, const int value, const int size) { + int gdx = get_global_id(0); + if(gdx < size) { + buffer[gdx] = value; + } +} + +template +__kernel void caffe_gpu_sign(const int N, __global T* X, __global T* Y) { + int gdx = get_global_id(0); + if(gdx < N) { + Y[gdx] =((X[gdx]>0.0)-(X[gdx]<0.0)); + } +} + +template __attribute__((mangled_name(caffe_gpu_sign_float))) __kernel void caffe_gpu_sign(const int N, __global float* X, __global float* Y); +template __attribute__((mangled_name(caffe_gpu_sign_double))) __kernel void caffe_gpu_sign(const int N, __global double* X, __global double* Y); + +template +__kernel void caffe_gpu_sgnbit(const int N, __global T* X, __global T* Y) { + int gdx = get_global_id(0); + if(gdx < N) { + Y[gdx] =(X[gdx] < 0.0); + } +} + +template __attribute__((mangled_name(caffe_gpu_sgnbit_float))) __kernel void caffe_gpu_sgnbit(const int N, __global float* X, __global float* Y); +template __attribute__((mangled_name(caffe_gpu_sgnbit_double))) __kernel void caffe_gpu_sgnbit(const int N, __global double* X, __global double* Y); + +template +__kernel void caffe_gpu_sign_with_offset(const int N, __global T* X, const int offx, __global T* Y, const int offy) { + X += offx; + Y += offy; + int gdx = get_global_id(0); + if(gdx < N) { + Y[gdx] =((X[gdx]>0.0)-(X[gdx]<0.0)); + } +} +template __attribute__((mangled_name(caffe_gpu_sign_with_offset_float))) __kernel void caffe_gpu_sign_with_offset(const int N, __global float* X, const int offx, __global float* Y, const int offy); +template __attribute__((mangled_name(caffe_gpu_sign_with_offset_double))) __kernel void caffe_gpu_sign_with_offset(const int N, __global double* X, const int offx, __global double* Y, const int offy); + +template +__kernel void caffe_gpu_abs(const int n, __global T* a, __global T* y) { + int index = get_global_id(0); + if(index < n) { + y[index] = fabs(a[index]); + } +} +template __attribute__((mangled_name(caffe_gpu_abs_float))) __kernel void caffe_gpu_abs(const int n, __global float* a, __global float* Y); +template __attribute__((mangled_name(caffe_gpu_abs_double))) __kernel void caffe_gpu_abs(const int n, __global double* a, __global double* Y); + +template +__kernel void get_max(const int num, const int dim, __global T* data, __global T* out) { + int index = get_global_id(0); + if (index < num) { + T maxval = -FLT_MAX; + for (int i = 0; i < dim; i++) + maxval = max( data[index*dim + i], maxval ); + out[index] = maxval; + } +} + +template __attribute__ ((mangled_name(get_max_float))) __kernel void get_max(const int num, const int dim, __global float* data, __global float* out); +template __attribute__ ((mangled_name(get_max_double))) __kernel 
void get_max(const int num, const int dim, __global double* data, __global double* out); + +template +__kernel void exp (const int num, __global T* data, __global T* out) { + int index = get_global_id(0); + if (index < num) + out[index] = exp(data[index]); +} + +template __attribute__ ((mangled_name(exp_float))) __kernel void exp (const int num, __global float* data, __global float* out); +template __attribute__ ((mangled_name(exp_double))) __kernel void exp (const int num, __global double* data, __global double* out); + +template +__kernel void kernel_sub(const int count, __global const T* a, __global const T* b, __global T* out) { + int index = get_global_id(0); + if(index < count) { + out[index] = a[index] - b[index]; + } +} + +template __attribute__ ((mangled_name(kernel_sub_float))) __kernel void kernel_sub(const int count, __global const float* a, __global const float* b, __global float* out); +template __attribute__ ((mangled_name(kernel_sub_double))) __kernel void kernel_sub(const int count, __global const double* a, __global const double* b, __global double* out); + +template +__kernel void kernel_add(const int count, __global const T* a, __global const T* b, __global T* out) { + int index = get_global_id(0); + if(index < count) { + out[index] = a[index] + b[index]; + } +} + +template __attribute__ ((mangled_name(kernel_add_float))) __kernel void kernel_add(const int count, __global const float* a, __global const float* b, __global float* out); +template __attribute__ ((mangled_name(kernel_add_double))) __kernel void kernel_add(const int count, __global const double* a, __global const double* b, __global double* out); + +template +__kernel void kernel_div(const int count, __global const T* a, __global const T* b, __global T* out) { + int index = get_global_id(0); + if(index < count) { + out[index] = a[index] / b[index]; + } +} + +template __attribute__ ((mangled_name(kernel_div_float))) __kernel void kernel_div(const int count, __global const float* a, __global const float* b, __global float* out); +template __attribute__ ((mangled_name(kernel_div_double))) __kernel void kernel_div(const int count, __global const double* a, __global const double* b, __global double* out); + +template +__kernel void kernel_mul(const int count, __global const T* a, __global const T* b, __global T* out) { + int index = get_global_id(0); + if(index < count) { + out[index] = a[index] * b[index]; + } +} + +template __attribute__ ((mangled_name(kernel_mul_float))) __kernel void kernel_mul(const int count, __global const float* a, __global const float* b, __global float* out); +template __attribute__ ((mangled_name(kernel_mul_double))) __kernel void kernel_mul(const int count, __global const double* a, __global const double* b, __global double* out); + +template +__kernel void kernel_powx(const int count, __global const T* data, const T alpha, __global T* out) { + int index = get_global_id(0); + if(index < count) { + out[index] = pow(data[index], alpha); + } +} + +template __attribute__ ((mangled_name(kernel_powx_float))) __kernel void kernel_powx(const int count, __global const float* data, const float alpha, __global float* out); +template __attribute__ ((mangled_name(kernel_powx_double))) __kernel void kernel_powx(const int count, __global const double* data, const double alpha, __global double* out); + +template +__kernel void kernel_exp(const int count, __global const T* data, __global T* out) { + int index = get_global_id(0); + if(index < count) { + out[index] = exp(data[index]); + } +} + +template 
__attribute__ ((mangled_name(kernel_exp_float))) __kernel void kernel_exp(const int count, __global const float* data, __global float* out); +template __attribute__ ((mangled_name(kernel_exp_double))) __kernel void kernel_exp(const int count, __global const double* data, __global double* out); + +template +__kernel void kernel_add_scalar(const int count, const T data, __global T* out) { + int index = get_global_id(0); + if(index < count) { + out[index] = out[index] + data; + } +} + +template __attribute__ ((mangled_name(kernel_add_scalar_float))) __kernel void kernel_add_scalar(const int count, const float data, __global float* out); +template __attribute__ ((mangled_name(kernel_add_scalar_double))) __kernel void kernel_add_scalar(const int count, const double data, __global double* out); + +template +__kernel void kernel_log(const int count, __global const T* data, __global T* out) { + int index = get_global_id(0); + if(index < count) { + out[index] = log(data[index]); + } +} + +template __attribute__ ((mangled_name(kernel_log_float))) __kernel void kernel_log(const int count, __global const float* data, __global float* out); +template __attribute__ ((mangled_name(kernel_log_double))) __kernel void kernel_log(const int count, __global const double* data, __global double* out); + +template +__kernel void diff (const int num, const int dim, __global T* data, __global T* label) { + int index = get_global_id(0); + int total = get_global_size(0); + int offset; + for(index; index < num; index += total) { + offset = (int) label[index]; + data[index * dim + offset] -= 1; + } +} + +template __attribute__ ((mangled_name(diff_float))) __kernel void diff (const int num, const int dim, __global float* data, __global float* label); +template __attribute__ ((mangled_name(diff_double))) __kernel void diff (const int num, const int dim, __global double* data, __global double* label); + +template +__kernel void div (const int n, __global const T* a, __global const T* b, __global T* y) { + int index = get_global_id(0); + if (index < n) + y[index] = a[index] / b[index]; +} + +template __attribute__ ((mangled_name(div_float))) __kernel void div (const int n, __global const float* a, __global const float* b, __global float* y); +//template __attribute__ ((mangled_name(div_double))) __kernel void div (const int n, __global const double* a, __global const double* b, __global double* y); + +template +__kernel void add_scalar (const int n, const T alpha, __global T* y) { + int index = get_global_id(0); + if (index < n) + y[index] += alpha; +} + +template __attribute__ ((mangled_name(add_scalar_float))) __kernel void add_scalar (const int n, const float alpha, __global float* y); +template __attribute__ ((mangled_name(add_scalar_double))) __kernel void add_scalar (const int n, const double alpha, __global double* y); + +template +__kernel void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y) { + int index = get_global_id(0); + if (index < n) + y[index] = in1[index] + in2[index]; +} +template __attribute__ ((mangled_name(caffe_gpu_add_float))) __kernel void caffe_gpu_add(const int n, const float* in1, const float* in2, float* y); +template __attribute__ ((mangled_name(caffe_gpu_add_double))) __kernel void caffe_gpu_add(const int n, const double* in1, const double* in2, double* y); + +template +__kernel void element_mul (const int n, __global const T* a, __global const T* b, __global T* y) { + int index = get_global_id(0); + if (index < n) + y[index] = a[index] * b[index]; +} + +template 
__attribute__ ((mangled_name(element_mul_float))) __kernel void element_mul (const int n, __global const float* a, __global const float* b, __global float* y); +template __attribute__ ((mangled_name(element_mul_double))) __kernel void element_mul (const int n,__global const double* a, __global const double* b, __global double* y); + +template +__kernel void powx (const int n, __global const T* a, const T alpha, __global T* y) { + int index = get_global_id(0); + if (index < n) +// y[index] = a[index] + alpha; + y[index] = pow(a[index], alpha); +} + +template __attribute__ ((mangled_name(powx_float))) __kernel void powx (const int n, __global const float* a, const float alpha, __global float* y); +template __attribute__ ((mangled_name(powx_double))) __kernel void powx (const int n, __global const double* a, const double alpha, __global double* y); + diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index aabe0ede..b9ed1050 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -10,7 +10,7 @@ #include "caffe/util/io.hpp" #include "caffe/util/math_functions.hpp" #include "caffe/util/upgrade_proto.hpp" - +#include "caffe/util/ocl_wrapper.hpp" namespace caffe { template @@ -30,9 +30,10 @@ Solver::Solver(const string& param_file) template void Solver::Init(const SolverParameter& param) { LOG(INFO) << "Initializing solver from parameters: " << std::endl - << param.DebugString(); + << param.DebugString(); param_ = param; CHECK_GE(param_.average_loss(), 1) << "average_loss should be non-negative."; + if (param_.random_seed() >= 0) { Caffe::set_random_seed(param_.random_seed()); } @@ -46,8 +47,8 @@ void Solver::Init(const SolverParameter& param) { template void Solver::InitTrainNet() { - const int num_train_nets = param_.has_net() + param_.has_net_param() + - param_.has_train_net() + param_.has_train_net_param(); + const int num_train_nets = param_.has_net() + param_.has_net_param() + + param_.has_train_net() + param_.has_train_net_param(); const string& field_names = "net, net_param, train_net, train_net_param"; CHECK_GE(num_train_nets, 1) << "SolverParameter must specify a train net " << "using one of these fields: " << field_names; @@ -59,7 +60,7 @@ void Solver::InitTrainNet() { net_param.CopyFrom(param_.train_net_param()); } else if (param_.has_train_net()) { LOG(INFO) << "Creating training net from train_net file: " - << param_.train_net(); + << param_.train_net(); ReadNetParamsFromTextFileOrDie(param_.train_net(), &net_param); } if (param_.has_net_param()) { @@ -93,11 +94,11 @@ void Solver::InitTestNets() { const int num_test_net_files = param_.test_net_size(); const int num_test_nets = num_test_net_params + num_test_net_files; if (num_generic_nets) { - CHECK_GE(param_.test_iter_size(), num_test_nets) - << "test_iter must be specified for each test network."; + CHECK_GE(param_.test_iter_size(), num_test_nets) + << "test_iter must be specified for each test network."; } else { - CHECK_EQ(param_.test_iter_size(), num_test_nets) - << "test_iter must be specified for each test network."; + CHECK_EQ(param_.test_iter_size(), num_test_nets) + << "test_iter must be specified for each test network."; } // If we have a generic net (specified by net or net_param, rather than // test_net or test_net_param), we may have an unlimited number of actual @@ -114,16 +115,16 @@ void Solver::InitTestNets() { CHECK_GT(param_.test_interval(), 0); } int test_net_id = 0; - vector sources(num_test_net_instances); - vector net_params(num_test_net_instances); + vector < string > 
sources(num_test_net_instances); + vector < NetParameter > net_params(num_test_net_instances); for (int i = 0; i < num_test_net_params; ++i, ++test_net_id) { - sources[test_net_id] = "test_net_param"; - net_params[test_net_id].CopyFrom(param_.test_net_param(i)); + sources[test_net_id] = "test_net_param"; + net_params[test_net_id].CopyFrom(param_.test_net_param(i)); } for (int i = 0; i < num_test_net_files; ++i, ++test_net_id) { - sources[test_net_id] = "test_net file: " + param_.test_net(i); - ReadNetParamsFromTextFileOrDie(param_.test_net(i), - &net_params[test_net_id]); + sources[test_net_id] = "test_net file: " + param_.test_net(i); + ReadNetParamsFromTextFileOrDie(param_.test_net(i), + &net_params[test_net_id]); } const int remaining_test_nets = param_.test_iter_size() - test_net_id; if (has_net_param) { @@ -151,8 +152,7 @@ void Solver::InitTestNets() { net_state.MergeFrom(param_.test_state(i)); } net_params[i].mutable_state()->CopyFrom(net_state); - LOG(INFO) - << "Creating test net (#" << i << ") specified by " << sources[i]; + LOG(INFO) << "Creating test net (#" << i << ") specified by " << sources[i]; test_nets_[i].reset(new Net(net_params[i])); test_nets_[i]->set_debug_info(param_.debug_info()); } @@ -164,13 +164,13 @@ void Solver::Step(int iters) { const int start_iter = iter_; const int stop_iter = iter_ + iters; int average_loss = this->param_.average_loss(); - vector losses; + vector < Dtype > losses; Dtype smoothed_loss = 0; while (iter_ < stop_iter) { // zero-init the params for (int i = 0; i < net_->params().size(); ++i) { - shared_ptr > blob = net_->params()[i]; + shared_ptr < Blob > blob = net_->params()[i]; switch (Caffe::mode()) { case Caffe::CPU: caffe_set(blob->count(), static_cast(0), @@ -182,6 +182,13 @@ void Solver::Step(int iters) { blob->mutable_gpu_diff()); #else NO_GPU; +#endif + case Caffe::APU: +#ifndef CPU_ONLY + caffe_gpu_set(blob->count(), static_cast(0), + blob->mutable_gpu_diff()); +#else + NO_GPU; #endif break; } @@ -223,12 +230,11 @@ void Solver::Step(int iters) { for (int k = 0; k < result[j]->count(); ++k) { ostringstream loss_msg_stream; if (loss_weight) { - loss_msg_stream << " (* " << loss_weight - << " = " << loss_weight * result_vec[k] << " loss)"; + loss_msg_stream << " (* " << loss_weight << " = " + << loss_weight * result_vec[k] << " loss)"; } - LOG(INFO) << " Train net output #" - << score_index++ << ": " << output_name << " = " - << result_vec[k] << loss_msg_stream.str(); + LOG(INFO) << " Train net output #" << score_index++ << ": " + << output_name << " = " << result_vec[k] << loss_msg_stream.str(); } } } @@ -281,7 +287,6 @@ void Solver::Solve(const char* resume_file) { LOG(INFO) << "Optimization Done."; } - template void Solver::TestAll() { for (int test_net_id = 0; test_net_id < test_nets_.size(); ++test_net_id) { @@ -291,19 +296,19 @@ void Solver::TestAll() { template void Solver::Test(const int test_net_id) { - LOG(INFO) << "Iteration " << iter_ - << ", Testing net (#" << test_net_id << ")"; - CHECK_NOTNULL(test_nets_[test_net_id].get())-> - ShareTrainedLayersWith(net_.get()); - vector test_score; + LOG(INFO) << "Iteration " << iter_ << ", Testing net (#" << test_net_id + << ")"; + CHECK_NOTNULL(test_nets_[test_net_id].get())->ShareTrainedLayersWith( + net_.get()); + vector < Dtype > test_score; vector test_score_output_id; vector*> bottom_vec; const shared_ptr >& test_net = test_nets_[test_net_id]; Dtype loss = 0; for (int i = 0; i < param_.test_iter(test_net_id); ++i) { Dtype iter_loss; - const vector*>& result = - 
test_net->Forward(bottom_vec, &iter_loss); + const vector*>& result = test_net->Forward(bottom_vec, + &iter_loss); if (param_.test_compute_loss()) { loss += iter_loss; } @@ -337,15 +342,14 @@ void Solver::Test(const int test_net_id) { ostringstream loss_msg_stream; const Dtype mean_score = test_score[i] / param_.test_iter(test_net_id); if (loss_weight) { - loss_msg_stream << " (* " << loss_weight - << " = " << loss_weight * mean_score << " loss)"; + loss_msg_stream << " (* " << loss_weight << " = " + << loss_weight * mean_score << " loss)"; } LOG(INFO) << " Test net output #" << i << ": " << output_name << " = " << mean_score << loss_msg_stream.str(); } } - template void Solver::Snapshot() { NetParameter net_param; @@ -384,7 +388,6 @@ void Solver::Restore(const char* state_file) { RestoreSolverState(state); } - // Return the current learning rate. The currently implemented learning rate // policies are as follows: // - fixed: always return base_lr. @@ -408,31 +411,36 @@ Dtype SGDSolver::GetLearningRate() { rate = this->param_.base_lr(); } else if (lr_policy == "step") { this->current_step_ = this->iter_ / this->param_.stepsize(); - rate = this->param_.base_lr() * - pow(this->param_.gamma(), this->current_step_); + rate = this->param_.base_lr() + * pow(this->param_.gamma(), this->current_step_); } else if (lr_policy == "exp") { rate = this->param_.base_lr() * pow(this->param_.gamma(), this->iter_); } else if (lr_policy == "inv") { - rate = this->param_.base_lr() * - pow(Dtype(1) + this->param_.gamma() * this->iter_, - - this->param_.power()); + rate = this->param_.base_lr() + * pow(Dtype(1) + this->param_.gamma() * this->iter_, + -this->param_.power()); } else if (lr_policy == "multistep") { - if (this->current_step_ < this->param_.stepvalue_size() && - this->iter_ >= this->param_.stepvalue(this->current_step_)) { + if (this->current_step_ < this->param_.stepvalue_size() + && this->iter_ >= this->param_.stepvalue(this->current_step_)) { this->current_step_++; - LOG(INFO) << "MultiStep Status: Iteration " << - this->iter_ << ", step = " << this->current_step_; + LOG(INFO) << "MultiStep Status: Iteration " << this->iter_ << ", step = " + << this->current_step_; } - rate = this->param_.base_lr() * - pow(this->param_.gamma(), this->current_step_); + rate = this->param_.base_lr() + * pow(this->param_.gamma(), this->current_step_); } else if (lr_policy == "poly") { - rate = this->param_.base_lr() * pow(Dtype(1.) - - (Dtype(this->iter_) / Dtype(this->param_.max_iter())), - this->param_.power()); + rate = this->param_.base_lr() + * pow(Dtype(1.) - (Dtype(this->iter_) / Dtype(this->param_.max_iter())), + this->param_.power()); } else if (lr_policy == "sigmoid") { - rate = this->param_.base_lr() * (Dtype(1.) / - (Dtype(1.) + exp(-this->param_.gamma() * (Dtype(this->iter_) - - Dtype(this->param_.stepsize()))))); + rate = + this->param_.base_lr() + * (Dtype(1.) + / (Dtype(1.) 
+        + exp(
+            -this->param_.gamma()
+                * (Dtype(this->iter_)
+                    - Dtype(this->param_.stepsize())))));
   } else {
     LOG(FATAL) << "Unknown learning rate policy: " << lr_policy;
   }
@@ -448,16 +456,18 @@ void SGDSolver<Dtype>::PreSolve() {
   temp_.clear();
   for (int i = 0; i < net_params.size(); ++i) {
     const vector<int>& shape = net_params[i]->shape();
-    history_.push_back(shared_ptr<Blob<Dtype> >(new Blob<Dtype>(shape)));
-    update_.push_back(shared_ptr<Blob<Dtype> >(new Blob<Dtype>(shape)));
-    temp_.push_back(shared_ptr<Blob<Dtype> >(new Blob<Dtype>(shape)));
+    history_.push_back(shared_ptr < Blob<Dtype> > (new Blob<Dtype>(shape)));
+    update_.push_back(shared_ptr < Blob<Dtype> > (new Blob<Dtype>(shape)));
+    temp_.push_back(shared_ptr < Blob<Dtype> > (new Blob<Dtype>(shape)));
   }
 }
 
 template <typename Dtype>
 void SGDSolver<Dtype>::ClipGradients() {
   const Dtype clip_gradients = this->param_.clip_gradients();
-  if (clip_gradients < 0) { return; }
+  if (clip_gradients < 0) {
+    return;
+  }
   const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
   Dtype sumsq_diff = 0;
   for (int i = 0; i < net_params.size(); ++i) {
@@ -469,8 +479,8 @@ void SGDSolver<Dtype>::ClipGradients() {
   if (l2norm_diff > clip_gradients) {
     Dtype scale_factor = clip_gradients / l2norm_diff;
     LOG(INFO) << "Gradient clipping: scaling down gradients (L2 norm "
-        << l2norm_diff << " > " << clip_gradients << ") "
-        << "by scale factor " << scale_factor;
+        << l2norm_diff << " > " << clip_gradients << ") " << "by scale factor "
+        << scale_factor;
     for (int i = 0; i < net_params.size(); ++i) {
       if (this->net_->param_owners()[i] < 0) {
         net_params[i]->scale_diff(scale_factor);
@@ -496,7 +506,9 @@ void SGDSolver<Dtype>::ApplyUpdate() {
 
 template <typename Dtype>
 void SGDSolver<Dtype>::Normalize(int param_id) {
-  if (this->param_.iter_size() == 1) { return; }
+  if (this->param_.iter_size() == 1) {
+    return;
+  }
   // Scale gradient to counterbalance accumulation.
   const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
   const Dtype accum_normalization = Dtype(1.) / this->param_.iter_size();
@@ -528,21 +540,20 @@ void SGDSolver<Dtype>::Regularize(int param_id) {
   Dtype weight_decay = this->param_.weight_decay();
   string regularization_type = this->param_.regularization_type();
   Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
+
   switch (Caffe::mode()) {
   case Caffe::CPU: {
     if (local_decay) {
       if (regularization_type == "L2") {
         // add weight decay
-        caffe_axpy(net_params[param_id]->count(),
-            local_decay,
+        caffe_axpy(net_params[param_id]->count(), local_decay,
             net_params[param_id]->cpu_data(),
             net_params[param_id]->mutable_cpu_diff());
       } else if (regularization_type == "L1") {
         caffe_cpu_sign(net_params[param_id]->count(),
             net_params[param_id]->cpu_data(),
             temp_[param_id]->mutable_cpu_data());
-        caffe_axpy(net_params[param_id]->count(),
-            local_decay,
+        caffe_axpy(net_params[param_id]->count(), local_decay,
             temp_[param_id]->cpu_data(),
             net_params[param_id]->mutable_cpu_diff());
       } else {
@@ -556,16 +567,14 @@ void SGDSolver<Dtype>::Regularize(int param_id) {
     if (local_decay) {
       if (regularization_type == "L2") {
         // add weight decay
-        caffe_gpu_axpy(net_params[param_id]->count(),
-            local_decay,
+        caffe_gpu_axpy(net_params[param_id]->count(), local_decay,
             net_params[param_id]->gpu_data(),
             net_params[param_id]->mutable_gpu_diff());
       } else if (regularization_type == "L1") {
         caffe_gpu_sign(net_params[param_id]->count(),
            net_params[param_id]->gpu_data(),
            temp_[param_id]->mutable_gpu_data());
-        caffe_gpu_axpy(net_params[param_id]->count(),
-            local_decay,
+        caffe_gpu_axpy(net_params[param_id]->count(), local_decay,
            temp_[param_id]->gpu_data(),
            net_params[param_id]->mutable_gpu_diff());
       } else {
@@ -592,19 +601,18 @@ void SGDSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
   switch (Caffe::mode()) {
   case Caffe::CPU: {
     caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
-        net_params[param_id]->cpu_diff(), momentum,
-        history_[param_id]->mutable_cpu_data());
-    caffe_copy(net_params[param_id]->count(),
-        history_[param_id]->cpu_data(),
+        net_params[param_id]->cpu_diff(), momentum,
+        history_[param_id]->mutable_cpu_data());
+    caffe_copy(net_params[param_id]->count(), history_[param_id]->cpu_data(),
         net_params[param_id]->mutable_cpu_diff());
     break;
   }
   case Caffe::GPU: {
 #ifndef CPU_ONLY
     caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
-        net_params[param_id]->gpu_diff(), momentum,
-        history_[param_id]->mutable_gpu_data());
-    caffe_copy(net_params[param_id]->count(),
+        net_params[param_id]->gpu_diff(), momentum,
+        history_[param_id]->mutable_gpu_data());
+    caffe_gpu_copy(net_params[param_id]->count(),
         history_[param_id]->gpu_data(),
         net_params[param_id]->mutable_gpu_diff());
 #else
@@ -652,8 +660,8 @@ void NesterovSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
     // update history
     caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
-        net_params[param_id]->cpu_diff(), momentum,
-        this->history_[param_id]->mutable_cpu_data());
+        net_params[param_id]->cpu_diff(), momentum,
+        this->history_[param_id]->mutable_cpu_data());
     // compute update: step back then over step
     caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) + momentum,
@@ -669,14 +677,14 @@ void NesterovSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
   case Caffe::GPU: {
 #ifndef CPU_ONLY
     // save history momentum for stepping back
-    caffe_copy(net_params[param_id]->count(),
+    caffe_gpu_copy(net_params[param_id]->count(),
         this->history_[param_id]->gpu_data(),
         this->update_[param_id]->mutable_gpu_data());
     // update history
     caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
-        net_params[param_id]->gpu_diff(), momentum,
-        this->history_[param_id]->mutable_gpu_data());
+        net_params[param_id]->gpu_diff(), momentum,
+        this->history_[param_id]->mutable_gpu_data());
     // compute update: step back then over step
     caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) + momentum,
@@ -684,7 +692,7 @@ void NesterovSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
         this->update_[param_id]->mutable_gpu_data());
     // copy
-    caffe_copy(net_params[param_id]->count(),
+    caffe_gpu_copy(net_params[param_id]->count(),
         this->update_[param_id]->gpu_data(),
         net_params[param_id]->mutable_gpu_diff());
 #else
@@ -706,9 +714,8 @@ void AdaGradSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
   switch (Caffe::mode()) {
   case Caffe::CPU: {
     // compute square of gradient in update
-    caffe_powx(net_params[param_id]->count(),
-        net_params[param_id]->cpu_diff(), Dtype(2),
-        this->update_[param_id]->mutable_cpu_data());
+    caffe_powx(net_params[param_id]->count(), net_params[param_id]->cpu_diff(),
+        Dtype(2), this->update_[param_id]->mutable_cpu_data());
     // update history
     caffe_add(net_params[param_id]->count(),
@@ -718,16 +725,15 @@ void AdaGradSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
     // prepare update
     caffe_powx(net_params[param_id]->count(),
-        this->history_[param_id]->cpu_data(), Dtype(0.5),
-        this->update_[param_id]->mutable_cpu_data());
+        this->history_[param_id]->cpu_data(), Dtype(0.5),
+        this->update_[param_id]->mutable_cpu_data());
 
-    caffe_add_scalar(net_params[param_id]->count(),
-        delta, this->update_[param_id]->mutable_cpu_data());
+    caffe_add_scalar(net_params[param_id]->count(), delta,
+        this->update_[param_id]->mutable_cpu_data());
 
-    caffe_div(net_params[param_id]->count(),
-        net_params[param_id]->cpu_diff(),
-        this->update_[param_id]->cpu_data(),
-        this->update_[param_id]->mutable_cpu_data());
+    caffe_div(net_params[param_id]->count(), net_params[param_id]->cpu_diff(),
+        this->update_[param_id]->cpu_data(),
+        this->update_[param_id]->mutable_cpu_data());
     // scale and copy
     caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
@@ -750,16 +756,15 @@ void AdaGradSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
     // prepare update
     caffe_gpu_powx(net_params[param_id]->count(),
-        this->history_[param_id]->gpu_data(), Dtype(0.5),
-        this->update_[param_id]->mutable_gpu_data());
+        this->history_[param_id]->gpu_data(), Dtype(0.5),
+        this->update_[param_id]->mutable_gpu_data());
 
-    caffe_gpu_add_scalar(net_params[param_id]->count(),
-        delta, this->update_[param_id]->mutable_gpu_data());
+    caffe_gpu_add_scalar < Dtype
+        > (net_params[param_id]->count(), delta, this->update_[param_id]->mutable_gpu_data());
 
     caffe_gpu_div(net_params[param_id]->count(),
-        net_params[param_id]->gpu_diff(),
-        this->update_[param_id]->gpu_data(),
-        this->update_[param_id]->mutable_gpu_data());
+        net_params[param_id]->gpu_diff(), this->update_[param_id]->gpu_data(),
+        this->update_[param_id]->mutable_gpu_data());
     // scale and copy
     caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
@@ -775,9 +780,9 @@ void AdaGradSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
   }
 }
 
-INSTANTIATE_CLASS(Solver);
-INSTANTIATE_CLASS(SGDSolver);
-INSTANTIATE_CLASS(NesterovSolver);
-INSTANTIATE_CLASS(AdaGradSolver);
+INSTANTIATE_CLASS (Solver);
+INSTANTIATE_CLASS (SGDSolver);
+INSTANTIATE_CLASS (NesterovSolver);
+INSTANTIATE_CLASS (AdaGradSolver);
 
 }  // namespace caffe
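Note on the GPU branches above: upstream caffe_copy resolves to memcpy (CPU mode) or cudaMemcpy (GPU mode) on raw pointers, but in this OpenCL port gpu_data()/gpu_diff() hand back handles to cl_mem buffers rather than raw device addresses, so the solvers' device-side copies are routed through the new caffe_gpu_copy instead. The port's actual helper is not part of this hunk; the sketch below is only an illustration of how such a copy could be expressed with clEnqueueCopyBuffer, reusing the amdDevice queue that the syncedmem.cpp changes further down rely on. The names, signature, and blocking clFinish here are assumptions, not code from the patch.

// Illustrative sketch only (not the port's caffe_gpu_copy): a device-side copy
// between two buffers whose "pointers" are really cl_mem handles, using the
// global amdDevice command queue that appears elsewhere in this patch.
template <typename Dtype>
void gpu_copy_sketch(const int n, const Dtype* src, Dtype* dst) {
  OCL_CHECK(clEnqueueCopyBuffer(amdDevice.CommandQueue,
      (cl_mem) src, (cl_mem) dst,    // opaque buffer handles, not raw addresses
      0, 0, n * sizeof(Dtype),       // byte offsets and byte count
      0, NULL, NULL));
  clFinish(amdDevice.CommandQueue);  // keep the synchronous semantics of the
                                     // cudaMemcpy path this replaces
}

A throughput-oriented implementation might drop the clFinish and chain the copy through OpenCL events instead; the blocking form simply mirrors the behaviour the solver code expects.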
diff --git a/src/caffe/syncedmem.cpp b/src/caffe/syncedmem.cpp
index 7617ccfb..76d3f2ea 100644
--- a/src/caffe/syncedmem.cpp
+++ b/src/caffe/syncedmem.cpp
@@ -1,43 +1,104 @@
+/*************************************************************************************
+ * Copyright (c) 2015, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation and/or
+ * other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ **************************************************************************************/
+
 #include <cstring>
 
 #include "caffe/common.hpp"
 #include "caffe/syncedmem.hpp"
 #include "caffe/util/math_functions.hpp"
+#include "caffe/util/ocl_util.hpp"
+
+#define CL_MEM_USE_PERSISTENT_MEM_AMD (1 << 6)//specific for AMD devices
 
 namespace caffe {
 
 SyncedMemory::~SyncedMemory() {
+#ifndef CPU_ONLY
   if (cpu_ptr_ && own_cpu_data_) {
-    CaffeFreeHost(cpu_ptr_);
+    OCL_CHECK(
+        clEnqueueUnmapMemObject(amdDevice.CommandQueue, (cl_mem) gpu_cache_ptr_,
+            cpu_ptr_, 0, NULL, NULL));
+    clFinish(amdDevice.CommandQueue);
+  }
+  if (gpu_cache_ptr_ && own_cpu_data_) {
+    OCL_CHECK(clReleaseMemObject((cl_mem) gpu_cache_ptr_));
   }
-
-#ifndef CPU_ONLY
   if (gpu_ptr_) {
-    CUDA_CHECK(cudaFree(gpu_ptr_));
+    OCL_CHECK(clReleaseMemObject((cl_mem) gpu_ptr_));
   }
-#endif  // CPU_ONLY
+
+  clReleaseKernel (oclmem_kernel);
+#endif
+}
+
+//begin: code written/modified by AMD.
+#ifndef CPU_ONLY
+void SyncedMemory::ocl_setup() {
+  cl_int err = 0;
+  oclmem_kernel = clCreateKernel(amdDevice.Program, "OCL_memset2", &err);
+  OCL_CHECK(err);
 }
+#endif
 
 inline void SyncedMemory::to_cpu() {
   switch (head_) {
   case UNINITIALIZED:
+#ifndef CPU_ONLY
+    gpu_cache_ptr_ = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR,
+        size_, NULL, NULL);
+    cpu_ptr_ = clEnqueueMapBuffer(amdDevice.CommandQueue,
+        (cl_mem) gpu_cache_ptr_, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size_,
+        0, NULL, NULL, NULL);
+#else
     CaffeMallocHost(&cpu_ptr_, size_);
-    caffe_memset(size_, 0, cpu_ptr_);
+#endif
+    memset(cpu_ptr_, 0, size_);
     head_ = HEAD_AT_CPU;
     own_cpu_data_ = true;
     break;
-  case HEAD_AT_GPU:
+  case HEAD_AT_GPU: {
 #ifndef CPU_ONLY
     if (cpu_ptr_ == NULL) {
-      CaffeMallocHost(&cpu_ptr_, size_);
+      gpu_cache_ptr_ = clCreateBuffer(amdDevice.Context, CL_MEM_ALLOC_HOST_PTR,
+          size_, NULL, NULL);
+      cpu_ptr_ = clEnqueueMapBuffer(amdDevice.CommandQueue,
+          (cl_mem) gpu_cache_ptr_, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0,
+          size_, 0, NULL, NULL, NULL);
       own_cpu_data_ = true;
     }
-    caffe_gpu_memcpy(size_, gpu_ptr_, cpu_ptr_);
+    OCL_CHECK(
+        clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) gpu_ptr_,
+            (cl_mem) gpu_cache_ptr_, 0, 0, size_, 0, NULL, NULL));
+    clFinish(amdDevice.CommandQueue);
     head_ = SYNCED;
 #else
     NO_GPU;
 #endif
     break;
+  }
   case HEAD_AT_CPU:
   case SYNCED:
     break;
@@ -47,18 +108,34 @@ inline void SyncedMemory::to_cpu() {
 inline void SyncedMemory::to_gpu() {
 #ifndef CPU_ONLY
   switch (head_) {
-  case UNINITIALIZED:
-    CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
-    caffe_gpu_memset(size_, 0, gpu_ptr_);
+  case UNINITIALIZED: {
+    cl_mem tmpMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, size_,
+        NULL, NULL);
+    if (NULL == tmpMem) {
+      fprintf(stderr, "Failed to create memory object\n");
+      break;
+    }
+    ocl_memset(tmpMem, (int) 0, (int) (size_ / sizeof(int)));
+    gpu_ptr_ = (void*) tmpMem;
     head_ = HEAD_AT_GPU;
     break;
-  case HEAD_AT_CPU:
+  }
+  case HEAD_AT_CPU: {
     if (gpu_ptr_ == NULL) {
-      CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
+      cl_mem tmpMem = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE,
+          size_, NULL, NULL);
+      if (NULL == tmpMem) {
+        fprintf(stderr, "Failed to create memory object\n");
+      }
+      gpu_ptr_ = (void*) tmpMem;
    }
-    caffe_gpu_memcpy(size_, cpu_ptr_, gpu_ptr_);
+    OCL_CHECK(
+        clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) gpu_cache_ptr_,
+            (cl_mem) gpu_ptr_, 0, 0, size_, 0, NULL, NULL));
+    clFinish(amdDevice.CommandQueue);
     head_ = SYNCED;
     break;
+  }
   case HEAD_AT_GPU:
   case SYNCED:
     break;
@@ -70,13 +147,13 @@ inline void SyncedMemory::to_gpu() {
 
 const void* SyncedMemory::cpu_data() {
   to_cpu();
-  return (const void*)cpu_ptr_;
+  return (const void*) cpu_ptr_;
 }
 
 void SyncedMemory::set_cpu_data(void* data) {
   CHECK(data);
   if (own_cpu_data_) {
-    CaffeFreeHost(cpu_ptr_);
+    CaffeFreeHost (cpu_ptr_);
   }
   cpu_ptr_ = data;
   head_ = HEAD_AT_CPU;
@@ -86,7 +163,7 @@ void SyncedMemory::set_cpu_data(void* data) {
 const void* SyncedMemory::gpu_data() {
 #ifndef CPU_ONLY
   to_gpu();
-  return (const void*)gpu_ptr_;
+  return (const void*) gpu_ptr_;
 #else
   NO_GPU;
 #endif
@@ -108,6 +185,9 @@ void* SyncedMemory::mutable_gpu_data() {
 #endif
 }
 
+const void *SyncedMemory::gpu_cache_data() {
+  return 0;
+}
 
 }  // namespace caffe
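The SyncedMemory rewrite above replaces cudaMalloc/cudaMemcpy with a two-buffer scheme: gpu_cache_ptr_ is a CL_MEM_ALLOC_HOST_PTR buffer that is mapped once with clEnqueueMapBuffer, so cpu_ptr_ points at pinned, device-visible host memory; gpu_ptr_ stays an ordinary CL_MEM_READ_WRITE buffer that kernels operate on; to_cpu()/to_gpu() keep the two in sync with clEnqueueCopyBuffer, and the destructor unmaps before releasing. The standalone sketch below reproduces that allocate/map/copy/teardown sequence with placeholder ctx and queue handles (the port itself goes through the global amdDevice object); it is an illustration under those assumptions, not code from the patch.

// Minimal, self-contained sketch of the map/copy pattern used above.
// "ctx" and "queue" are assumed to be a valid cl_context / cl_command_queue;
// error handling is reduced to a single errcode variable for brevity.
#include <CL/cl.h>
#include <cstring>

void pinned_buffer_demo(cl_context ctx, cl_command_queue queue, size_t size) {
  cl_int err = CL_SUCCESS;

  // Host-visible staging buffer; the runtime backs it with pinned memory.
  cl_mem cache = clCreateBuffer(ctx, CL_MEM_ALLOC_HOST_PTR, size, NULL, &err);
  // Plain device buffer that kernels read and write.
  cl_mem device = clCreateBuffer(ctx, CL_MEM_READ_WRITE, size, NULL, &err);

  // Map once; the returned pointer plays the role of cpu_ptr_ above.
  void* host = clEnqueueMapBuffer(queue, cache, CL_TRUE,
      CL_MAP_READ | CL_MAP_WRITE, 0, size, 0, NULL, NULL, &err);
  memset(host, 0, size);  // host-side initialization, as in to_cpu()

  // to_gpu(): stage host data into the device buffer.
  clEnqueueCopyBuffer(queue, cache, device, 0, 0, size, 0, NULL, NULL);
  // to_cpu(): bring device results back into the mapped staging buffer.
  clEnqueueCopyBuffer(queue, device, cache, 0, 0, size, 0, NULL, NULL);
  clFinish(queue);

  // Teardown mirrors the destructor: unmap before releasing the buffers.
  clEnqueueUnmapMemObject(queue, cache, host, 0, NULL, NULL);
  clFinish(queue);
  clReleaseMemObject(cache);
  clReleaseMemObject(device);
  (void) err;  // a real implementation would check err after every call
}

Keeping the staging buffer permanently mapped avoids a map/unmap round trip on every host access, which is the point of the gpu_cache_ptr_ design; the cost is an extra device-side copy whenever the two buffers fall out of sync.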
diff --git a/src/caffe/test/Makefile b/src/caffe/test/Makefile
new file mode 100644
index 00000000..c9e785c7
--- /dev/null
+++ b/src/caffe/test/Makefile
@@ -0,0 +1,1766 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 2.8
+
+# Default target executed when no arguments are given to make.
+default_target: all
+.PHONY : default_target
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Remove some rules from gmake that .SUFFIXES does not remove.
+SUFFIXES =
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E remove -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The program to use to edit the cache.
+CMAKE_EDIT_COMMAND = /usr/bin/ccmake
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/yugao/caffe-merge-junli/caffe-yb/caffe
+
+#=============================================================================
+# Targets provided globally by CMake.
+
+# Special rule for the target edit_cache
+edit_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake cache editor..."
+	/usr/bin/ccmake -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : edit_cache
+
+# Special rule for the target edit_cache
+edit_cache/fast: edit_cache
+.PHONY : edit_cache/fast
+
+# Special rule for the target install
+install: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install
+
+# Special rule for the target install
+install/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install/fast
+
+# Special rule for the target install/local
+install/local: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local
+
+# Special rule for the target install/local
+install/local/fast: install/local
+.PHONY : install/local/fast
+
+# Special rule for the target install/strip
+install/strip: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip
+
+# Special rule for the target install/strip
+install/strip/fast: install/strip
+.PHONY : install/strip/fast
+
+# Special rule for the target list_install_components
+list_install_components:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
+.PHONY : list_install_components
+
+# Special rule for the target list_install_components
+list_install_components/fast: list_install_components
+.PHONY : list_install_components/fast
+
+# Special rule for the target rebuild_cache
+rebuild_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
+ /usr/bin/cmake -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) +.PHONY : rebuild_cache + +# Special rule for the target rebuild_cache +rebuild_cache/fast: rebuild_cache +.PHONY : rebuild_cache/fast + +# The main all target +all: cmake_check_build_system + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -E cmake_progress_start /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test/CMakeFiles/progress.marks + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/caffe/test/all + $(CMAKE_COMMAND) -E cmake_progress_start /home/yugao/caffe-merge-junli/caffe-yb/caffe/CMakeFiles 0 +.PHONY : all + +# The main clean target +clean: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/caffe/test/clean +.PHONY : clean + +# The main clean target +clean/fast: clean +.PHONY : clean/fast + +# Prepare targets for installation. +preinstall: all + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/caffe/test/preinstall +.PHONY : preinstall + +# Prepare targets for installation. +preinstall/fast: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/caffe/test/preinstall +.PHONY : preinstall/fast + +# clear depends +depend: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1 +.PHONY : depend + +# Convenience name for target. +src/caffe/test/CMakeFiles/runtest.dir/rule: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/caffe/test/CMakeFiles/runtest.dir/rule +.PHONY : src/caffe/test/CMakeFiles/runtest.dir/rule + +# Convenience name for target. +runtest: src/caffe/test/CMakeFiles/runtest.dir/rule +.PHONY : runtest + +# fast build rule for target. +runtest/fast: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/runtest.dir/build.make src/caffe/test/CMakeFiles/runtest.dir/build +.PHONY : runtest/fast + +# Convenience name for target. +src/caffe/test/CMakeFiles/test.testbin.dir/rule: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f CMakeFiles/Makefile2 src/caffe/test/CMakeFiles/test.testbin.dir/rule +.PHONY : src/caffe/test/CMakeFiles/test.testbin.dir/rule + +# Convenience name for target. +test.testbin: src/caffe/test/CMakeFiles/test.testbin.dir/rule +.PHONY : test.testbin + +# fast build rule for target. 
+test.testbin/fast: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/build +.PHONY : test.testbin/fast + +test_accuracy_layer.o: test_accuracy_layer.cpp.o +.PHONY : test_accuracy_layer.o + +# target to build an object file +test_accuracy_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.o +.PHONY : test_accuracy_layer.cpp.o + +test_accuracy_layer.i: test_accuracy_layer.cpp.i +.PHONY : test_accuracy_layer.i + +# target to preprocess a source file +test_accuracy_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.i +.PHONY : test_accuracy_layer.cpp.i + +test_accuracy_layer.s: test_accuracy_layer.cpp.s +.PHONY : test_accuracy_layer.s + +# target to generate assembly for a file +test_accuracy_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_accuracy_layer.cpp.s +.PHONY : test_accuracy_layer.cpp.s + +test_argmax_layer.o: test_argmax_layer.cpp.o +.PHONY : test_argmax_layer.o + +# target to build an object file +test_argmax_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.o +.PHONY : test_argmax_layer.cpp.o + +test_argmax_layer.i: test_argmax_layer.cpp.i +.PHONY : test_argmax_layer.i + +# target to preprocess a source file +test_argmax_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.i +.PHONY : test_argmax_layer.cpp.i + +test_argmax_layer.s: test_argmax_layer.cpp.s +.PHONY : test_argmax_layer.s + +# target to generate assembly for a file +test_argmax_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_argmax_layer.cpp.s +.PHONY : test_argmax_layer.cpp.s + +test_benchmark.o: test_benchmark.cpp.o +.PHONY : test_benchmark.o + +# target to build an object file +test_benchmark.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.o +.PHONY : test_benchmark.cpp.o + +test_benchmark.i: test_benchmark.cpp.i +.PHONY : test_benchmark.i + +# target to preprocess a source file +test_benchmark.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.i +.PHONY : test_benchmark.cpp.i + +test_benchmark.s: test_benchmark.cpp.s +.PHONY : test_benchmark.s + +# target to generate assembly for a file +test_benchmark.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_benchmark.cpp.s +.PHONY : test_benchmark.cpp.s + +test_blob.o: test_blob.cpp.o +.PHONY : test_blob.o + +# target to build an object file 
+test_blob.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.o +.PHONY : test_blob.cpp.o + +test_blob.i: test_blob.cpp.i +.PHONY : test_blob.i + +# target to preprocess a source file +test_blob.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.i +.PHONY : test_blob.cpp.i + +test_blob.s: test_blob.cpp.s +.PHONY : test_blob.s + +# target to generate assembly for a file +test_blob.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_blob.cpp.s +.PHONY : test_blob.cpp.s + +test_caffe_main.o: test_caffe_main.cpp.o +.PHONY : test_caffe_main.o + +# target to build an object file +test_caffe_main.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.o +.PHONY : test_caffe_main.cpp.o + +test_caffe_main.i: test_caffe_main.cpp.i +.PHONY : test_caffe_main.i + +# target to preprocess a source file +test_caffe_main.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.i +.PHONY : test_caffe_main.cpp.i + +test_caffe_main.s: test_caffe_main.cpp.s +.PHONY : test_caffe_main.s + +# target to generate assembly for a file +test_caffe_main.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_caffe_main.cpp.s +.PHONY : test_caffe_main.cpp.s + +test_common.o: test_common.cpp.o +.PHONY : test_common.o + +# target to build an object file +test_common.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.o +.PHONY : test_common.cpp.o + +test_common.i: test_common.cpp.i +.PHONY : test_common.i + +# target to preprocess a source file +test_common.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.i +.PHONY : test_common.cpp.i + +test_common.s: test_common.cpp.s +.PHONY : test_common.s + +# target to generate assembly for a file +test_common.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_common.cpp.s +.PHONY : test_common.cpp.s + +test_concat_layer.o: test_concat_layer.cpp.o +.PHONY : test_concat_layer.o + +# target to build an object file +test_concat_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.o +.PHONY : test_concat_layer.cpp.o + +test_concat_layer.i: test_concat_layer.cpp.i +.PHONY : test_concat_layer.i + +# target to preprocess a source file +test_concat_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make 
src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.i +.PHONY : test_concat_layer.cpp.i + +test_concat_layer.s: test_concat_layer.cpp.s +.PHONY : test_concat_layer.s + +# target to generate assembly for a file +test_concat_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_concat_layer.cpp.s +.PHONY : test_concat_layer.cpp.s + +test_contrastive_loss_layer.o: test_contrastive_loss_layer.cpp.o +.PHONY : test_contrastive_loss_layer.o + +# target to build an object file +test_contrastive_loss_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.o +.PHONY : test_contrastive_loss_layer.cpp.o + +test_contrastive_loss_layer.i: test_contrastive_loss_layer.cpp.i +.PHONY : test_contrastive_loss_layer.i + +# target to preprocess a source file +test_contrastive_loss_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.i +.PHONY : test_contrastive_loss_layer.cpp.i + +test_contrastive_loss_layer.s: test_contrastive_loss_layer.cpp.s +.PHONY : test_contrastive_loss_layer.s + +# target to generate assembly for a file +test_contrastive_loss_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_contrastive_loss_layer.cpp.s +.PHONY : test_contrastive_loss_layer.cpp.s + +test_convolution_layer.o: test_convolution_layer.cpp.o +.PHONY : test_convolution_layer.o + +# target to build an object file +test_convolution_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.o +.PHONY : test_convolution_layer.cpp.o + +test_convolution_layer.i: test_convolution_layer.cpp.i +.PHONY : test_convolution_layer.i + +# target to preprocess a source file +test_convolution_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.i +.PHONY : test_convolution_layer.cpp.i + +test_convolution_layer.s: test_convolution_layer.cpp.s +.PHONY : test_convolution_layer.s + +# target to generate assembly for a file +test_convolution_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_convolution_layer.cpp.s +.PHONY : test_convolution_layer.cpp.s + +test_data_layer.o: test_data_layer.cpp.o +.PHONY : test_data_layer.o + +# target to build an object file +test_data_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.o +.PHONY : test_data_layer.cpp.o + +test_data_layer.i: test_data_layer.cpp.i +.PHONY : test_data_layer.i + +# target to preprocess a source file +test_data_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make 
src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.i +.PHONY : test_data_layer.cpp.i + +test_data_layer.s: test_data_layer.cpp.s +.PHONY : test_data_layer.s + +# target to generate assembly for a file +test_data_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_data_layer.cpp.s +.PHONY : test_data_layer.cpp.s + +test_data_transformer.o: test_data_transformer.cpp.o +.PHONY : test_data_transformer.o + +# target to build an object file +test_data_transformer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.o +.PHONY : test_data_transformer.cpp.o + +test_data_transformer.i: test_data_transformer.cpp.i +.PHONY : test_data_transformer.i + +# target to preprocess a source file +test_data_transformer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.i +.PHONY : test_data_transformer.cpp.i + +test_data_transformer.s: test_data_transformer.cpp.s +.PHONY : test_data_transformer.s + +# target to generate assembly for a file +test_data_transformer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_data_transformer.cpp.s +.PHONY : test_data_transformer.cpp.s + +test_db.o: test_db.cpp.o +.PHONY : test_db.o + +# target to build an object file +test_db.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.o +.PHONY : test_db.cpp.o + +test_db.i: test_db.cpp.i +.PHONY : test_db.i + +# target to preprocess a source file +test_db.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.i +.PHONY : test_db.cpp.i + +test_db.s: test_db.cpp.s +.PHONY : test_db.s + +# target to generate assembly for a file +test_db.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_db.cpp.s +.PHONY : test_db.cpp.s + +test_deconvolution_layer.o: test_deconvolution_layer.cpp.o +.PHONY : test_deconvolution_layer.o + +# target to build an object file +test_deconvolution_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.o +.PHONY : test_deconvolution_layer.cpp.o + +test_deconvolution_layer.i: test_deconvolution_layer.cpp.i +.PHONY : test_deconvolution_layer.i + +# target to preprocess a source file +test_deconvolution_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.i +.PHONY : test_deconvolution_layer.cpp.i + +test_deconvolution_layer.s: test_deconvolution_layer.cpp.s +.PHONY : test_deconvolution_layer.s + +# target to generate assembly for a file +test_deconvolution_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe 
&& $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_deconvolution_layer.cpp.s +.PHONY : test_deconvolution_layer.cpp.s + +test_dummy_data_layer.o: test_dummy_data_layer.cpp.o +.PHONY : test_dummy_data_layer.o + +# target to build an object file +test_dummy_data_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.o +.PHONY : test_dummy_data_layer.cpp.o + +test_dummy_data_layer.i: test_dummy_data_layer.cpp.i +.PHONY : test_dummy_data_layer.i + +# target to preprocess a source file +test_dummy_data_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.i +.PHONY : test_dummy_data_layer.cpp.i + +test_dummy_data_layer.s: test_dummy_data_layer.cpp.s +.PHONY : test_dummy_data_layer.s + +# target to generate assembly for a file +test_dummy_data_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_dummy_data_layer.cpp.s +.PHONY : test_dummy_data_layer.cpp.s + +test_eltwise_layer.o: test_eltwise_layer.cpp.o +.PHONY : test_eltwise_layer.o + +# target to build an object file +test_eltwise_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.o +.PHONY : test_eltwise_layer.cpp.o + +test_eltwise_layer.i: test_eltwise_layer.cpp.i +.PHONY : test_eltwise_layer.i + +# target to preprocess a source file +test_eltwise_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.i +.PHONY : test_eltwise_layer.cpp.i + +test_eltwise_layer.s: test_eltwise_layer.cpp.s +.PHONY : test_eltwise_layer.s + +# target to generate assembly for a file +test_eltwise_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_eltwise_layer.cpp.s +.PHONY : test_eltwise_layer.cpp.s + +test_euclidean_loss_layer.o: test_euclidean_loss_layer.cpp.o +.PHONY : test_euclidean_loss_layer.o + +# target to build an object file +test_euclidean_loss_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.o +.PHONY : test_euclidean_loss_layer.cpp.o + +test_euclidean_loss_layer.i: test_euclidean_loss_layer.cpp.i +.PHONY : test_euclidean_loss_layer.i + +# target to preprocess a source file +test_euclidean_loss_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.i +.PHONY : test_euclidean_loss_layer.cpp.i + +test_euclidean_loss_layer.s: test_euclidean_loss_layer.cpp.s +.PHONY : test_euclidean_loss_layer.s + +# target to generate assembly for a file +test_euclidean_loss_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f 
src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_euclidean_loss_layer.cpp.s +.PHONY : test_euclidean_loss_layer.cpp.s + +test_filler.o: test_filler.cpp.o +.PHONY : test_filler.o + +# target to build an object file +test_filler.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.o +.PHONY : test_filler.cpp.o + +test_filler.i: test_filler.cpp.i +.PHONY : test_filler.i + +# target to preprocess a source file +test_filler.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.i +.PHONY : test_filler.cpp.i + +test_filler.s: test_filler.cpp.s +.PHONY : test_filler.s + +# target to generate assembly for a file +test_filler.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_filler.cpp.s +.PHONY : test_filler.cpp.s + +test_filter_layer.o: test_filter_layer.cpp.o +.PHONY : test_filter_layer.o + +# target to build an object file +test_filter_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.o +.PHONY : test_filter_layer.cpp.o + +test_filter_layer.i: test_filter_layer.cpp.i +.PHONY : test_filter_layer.i + +# target to preprocess a source file +test_filter_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.i +.PHONY : test_filter_layer.cpp.i + +test_filter_layer.s: test_filter_layer.cpp.s +.PHONY : test_filter_layer.s + +# target to generate assembly for a file +test_filter_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_filter_layer.cpp.s +.PHONY : test_filter_layer.cpp.s + +test_flatten_layer.o: test_flatten_layer.cpp.o +.PHONY : test_flatten_layer.o + +# target to build an object file +test_flatten_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.o +.PHONY : test_flatten_layer.cpp.o + +test_flatten_layer.i: test_flatten_layer.cpp.i +.PHONY : test_flatten_layer.i + +# target to preprocess a source file +test_flatten_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.i +.PHONY : test_flatten_layer.cpp.i + +test_flatten_layer.s: test_flatten_layer.cpp.s +.PHONY : test_flatten_layer.s + +# target to generate assembly for a file +test_flatten_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_flatten_layer.cpp.s +.PHONY : test_flatten_layer.cpp.s + +test_gradient_based_solver.o: test_gradient_based_solver.cpp.o +.PHONY : test_gradient_based_solver.o + +# target to build an object file +test_gradient_based_solver.cpp.o: + cd 
/home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.o +.PHONY : test_gradient_based_solver.cpp.o + +test_gradient_based_solver.i: test_gradient_based_solver.cpp.i +.PHONY : test_gradient_based_solver.i + +# target to preprocess a source file +test_gradient_based_solver.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.i +.PHONY : test_gradient_based_solver.cpp.i + +test_gradient_based_solver.s: test_gradient_based_solver.cpp.s +.PHONY : test_gradient_based_solver.s + +# target to generate assembly for a file +test_gradient_based_solver.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_gradient_based_solver.cpp.s +.PHONY : test_gradient_based_solver.cpp.s + +test_hdf5_output_layer.o: test_hdf5_output_layer.cpp.o +.PHONY : test_hdf5_output_layer.o + +# target to build an object file +test_hdf5_output_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.o +.PHONY : test_hdf5_output_layer.cpp.o + +test_hdf5_output_layer.i: test_hdf5_output_layer.cpp.i +.PHONY : test_hdf5_output_layer.i + +# target to preprocess a source file +test_hdf5_output_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.i +.PHONY : test_hdf5_output_layer.cpp.i + +test_hdf5_output_layer.s: test_hdf5_output_layer.cpp.s +.PHONY : test_hdf5_output_layer.s + +# target to generate assembly for a file +test_hdf5_output_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5_output_layer.cpp.s +.PHONY : test_hdf5_output_layer.cpp.s + +test_hdf5data_layer.o: test_hdf5data_layer.cpp.o +.PHONY : test_hdf5data_layer.o + +# target to build an object file +test_hdf5data_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.o +.PHONY : test_hdf5data_layer.cpp.o + +test_hdf5data_layer.i: test_hdf5data_layer.cpp.i +.PHONY : test_hdf5data_layer.i + +# target to preprocess a source file +test_hdf5data_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.i +.PHONY : test_hdf5data_layer.cpp.i + +test_hdf5data_layer.s: test_hdf5data_layer.cpp.s +.PHONY : test_hdf5data_layer.s + +# target to generate assembly for a file +test_hdf5data_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hdf5data_layer.cpp.s +.PHONY : test_hdf5data_layer.cpp.s + +test_hinge_loss_layer.o: test_hinge_loss_layer.cpp.o +.PHONY : test_hinge_loss_layer.o + +# target to build an object file +test_hinge_loss_layer.cpp.o: + cd 
/home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.o +.PHONY : test_hinge_loss_layer.cpp.o + +test_hinge_loss_layer.i: test_hinge_loss_layer.cpp.i +.PHONY : test_hinge_loss_layer.i + +# target to preprocess a source file +test_hinge_loss_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.i +.PHONY : test_hinge_loss_layer.cpp.i + +test_hinge_loss_layer.s: test_hinge_loss_layer.cpp.s +.PHONY : test_hinge_loss_layer.s + +# target to generate assembly for a file +test_hinge_loss_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_hinge_loss_layer.cpp.s +.PHONY : test_hinge_loss_layer.cpp.s + +test_im2col_layer.o: test_im2col_layer.cpp.o +.PHONY : test_im2col_layer.o + +# target to build an object file +test_im2col_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.o +.PHONY : test_im2col_layer.cpp.o + +test_im2col_layer.i: test_im2col_layer.cpp.i +.PHONY : test_im2col_layer.i + +# target to preprocess a source file +test_im2col_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.i +.PHONY : test_im2col_layer.cpp.i + +test_im2col_layer.s: test_im2col_layer.cpp.s +.PHONY : test_im2col_layer.s + +# target to generate assembly for a file +test_im2col_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_im2col_layer.cpp.s +.PHONY : test_im2col_layer.cpp.s + +test_image_data_layer.o: test_image_data_layer.cpp.o +.PHONY : test_image_data_layer.o + +# target to build an object file +test_image_data_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.o +.PHONY : test_image_data_layer.cpp.o + +test_image_data_layer.i: test_image_data_layer.cpp.i +.PHONY : test_image_data_layer.i + +# target to preprocess a source file +test_image_data_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.i +.PHONY : test_image_data_layer.cpp.i + +test_image_data_layer.s: test_image_data_layer.cpp.s +.PHONY : test_image_data_layer.s + +# target to generate assembly for a file +test_image_data_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_image_data_layer.cpp.s +.PHONY : test_image_data_layer.cpp.s + +test_infogain_loss_layer.o: test_infogain_loss_layer.cpp.o +.PHONY : test_infogain_loss_layer.o + +# target to build an object file +test_infogain_loss_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make 
src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.o +.PHONY : test_infogain_loss_layer.cpp.o + +test_infogain_loss_layer.i: test_infogain_loss_layer.cpp.i +.PHONY : test_infogain_loss_layer.i + +# target to preprocess a source file +test_infogain_loss_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.i +.PHONY : test_infogain_loss_layer.cpp.i + +test_infogain_loss_layer.s: test_infogain_loss_layer.cpp.s +.PHONY : test_infogain_loss_layer.s + +# target to generate assembly for a file +test_infogain_loss_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_infogain_loss_layer.cpp.s +.PHONY : test_infogain_loss_layer.cpp.s + +test_inner_product_layer.o: test_inner_product_layer.cpp.o +.PHONY : test_inner_product_layer.o + +# target to build an object file +test_inner_product_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.o +.PHONY : test_inner_product_layer.cpp.o + +test_inner_product_layer.i: test_inner_product_layer.cpp.i +.PHONY : test_inner_product_layer.i + +# target to preprocess a source file +test_inner_product_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.i +.PHONY : test_inner_product_layer.cpp.i + +test_inner_product_layer.s: test_inner_product_layer.cpp.s +.PHONY : test_inner_product_layer.s + +# target to generate assembly for a file +test_inner_product_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_inner_product_layer.cpp.s +.PHONY : test_inner_product_layer.cpp.s + +test_internal_thread.o: test_internal_thread.cpp.o +.PHONY : test_internal_thread.o + +# target to build an object file +test_internal_thread.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.o +.PHONY : test_internal_thread.cpp.o + +test_internal_thread.i: test_internal_thread.cpp.i +.PHONY : test_internal_thread.i + +# target to preprocess a source file +test_internal_thread.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.i +.PHONY : test_internal_thread.cpp.i + +test_internal_thread.s: test_internal_thread.cpp.s +.PHONY : test_internal_thread.s + +# target to generate assembly for a file +test_internal_thread.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_internal_thread.cpp.s +.PHONY : test_internal_thread.cpp.s + +test_io.o: test_io.cpp.o +.PHONY : test_io.o + +# target to build an object file +test_io.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make 
src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.o +.PHONY : test_io.cpp.o + +test_io.i: test_io.cpp.i +.PHONY : test_io.i + +# target to preprocess a source file +test_io.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.i +.PHONY : test_io.cpp.i + +test_io.s: test_io.cpp.s +.PHONY : test_io.s + +# target to generate assembly for a file +test_io.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_io.cpp.s +.PHONY : test_io.cpp.s + +test_layer_factory.o: test_layer_factory.cpp.o +.PHONY : test_layer_factory.o + +# target to build an object file +test_layer_factory.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.o +.PHONY : test_layer_factory.cpp.o + +test_layer_factory.i: test_layer_factory.cpp.i +.PHONY : test_layer_factory.i + +# target to preprocess a source file +test_layer_factory.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.i +.PHONY : test_layer_factory.cpp.i + +test_layer_factory.s: test_layer_factory.cpp.s +.PHONY : test_layer_factory.s + +# target to generate assembly for a file +test_layer_factory.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_layer_factory.cpp.s +.PHONY : test_layer_factory.cpp.s + +test_lrn_layer.o: test_lrn_layer.cpp.o +.PHONY : test_lrn_layer.o + +# target to build an object file +test_lrn_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.o +.PHONY : test_lrn_layer.cpp.o + +test_lrn_layer.i: test_lrn_layer.cpp.i +.PHONY : test_lrn_layer.i + +# target to preprocess a source file +test_lrn_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.i +.PHONY : test_lrn_layer.cpp.i + +test_lrn_layer.s: test_lrn_layer.cpp.s +.PHONY : test_lrn_layer.s + +# target to generate assembly for a file +test_lrn_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_lrn_layer.cpp.s +.PHONY : test_lrn_layer.cpp.s + +test_math_functions.o: test_math_functions.cpp.o +.PHONY : test_math_functions.o + +# target to build an object file +test_math_functions.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.o +.PHONY : test_math_functions.cpp.o + +test_math_functions.i: test_math_functions.cpp.i +.PHONY : test_math_functions.i + +# target to preprocess a source file +test_math_functions.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.i +.PHONY : 
test_math_functions.cpp.i + +test_math_functions.s: test_math_functions.cpp.s +.PHONY : test_math_functions.s + +# target to generate assembly for a file +test_math_functions.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_math_functions.cpp.s +.PHONY : test_math_functions.cpp.s + +test_maxpool_dropout_layers.o: test_maxpool_dropout_layers.cpp.o +.PHONY : test_maxpool_dropout_layers.o + +# target to build an object file +test_maxpool_dropout_layers.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.o +.PHONY : test_maxpool_dropout_layers.cpp.o + +test_maxpool_dropout_layers.i: test_maxpool_dropout_layers.cpp.i +.PHONY : test_maxpool_dropout_layers.i + +# target to preprocess a source file +test_maxpool_dropout_layers.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.i +.PHONY : test_maxpool_dropout_layers.cpp.i + +test_maxpool_dropout_layers.s: test_maxpool_dropout_layers.cpp.s +.PHONY : test_maxpool_dropout_layers.s + +# target to generate assembly for a file +test_maxpool_dropout_layers.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_maxpool_dropout_layers.cpp.s +.PHONY : test_maxpool_dropout_layers.cpp.s + +test_memory_data_layer.o: test_memory_data_layer.cpp.o +.PHONY : test_memory_data_layer.o + +# target to build an object file +test_memory_data_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.o +.PHONY : test_memory_data_layer.cpp.o + +test_memory_data_layer.i: test_memory_data_layer.cpp.i +.PHONY : test_memory_data_layer.i + +# target to preprocess a source file +test_memory_data_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.i +.PHONY : test_memory_data_layer.cpp.i + +test_memory_data_layer.s: test_memory_data_layer.cpp.s +.PHONY : test_memory_data_layer.s + +# target to generate assembly for a file +test_memory_data_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_memory_data_layer.cpp.s +.PHONY : test_memory_data_layer.cpp.s + +test_multinomial_logistic_loss_layer.o: test_multinomial_logistic_loss_layer.cpp.o +.PHONY : test_multinomial_logistic_loss_layer.o + +# target to build an object file +test_multinomial_logistic_loss_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.o +.PHONY : test_multinomial_logistic_loss_layer.cpp.o + +test_multinomial_logistic_loss_layer.i: test_multinomial_logistic_loss_layer.cpp.i +.PHONY : test_multinomial_logistic_loss_layer.i + +# target to preprocess a source file +test_multinomial_logistic_loss_layer.cpp.i: + cd 
/home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.i +.PHONY : test_multinomial_logistic_loss_layer.cpp.i + +test_multinomial_logistic_loss_layer.s: test_multinomial_logistic_loss_layer.cpp.s +.PHONY : test_multinomial_logistic_loss_layer.s + +# target to generate assembly for a file +test_multinomial_logistic_loss_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_multinomial_logistic_loss_layer.cpp.s +.PHONY : test_multinomial_logistic_loss_layer.cpp.s + +test_mvn_layer.o: test_mvn_layer.cpp.o +.PHONY : test_mvn_layer.o + +# target to build an object file +test_mvn_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.o +.PHONY : test_mvn_layer.cpp.o + +test_mvn_layer.i: test_mvn_layer.cpp.i +.PHONY : test_mvn_layer.i + +# target to preprocess a source file +test_mvn_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.i +.PHONY : test_mvn_layer.cpp.i + +test_mvn_layer.s: test_mvn_layer.cpp.s +.PHONY : test_mvn_layer.s + +# target to generate assembly for a file +test_mvn_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_mvn_layer.cpp.s +.PHONY : test_mvn_layer.cpp.s + +test_net.o: test_net.cpp.o +.PHONY : test_net.o + +# target to build an object file +test_net.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.o +.PHONY : test_net.cpp.o + +test_net.i: test_net.cpp.i +.PHONY : test_net.i + +# target to preprocess a source file +test_net.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.i +.PHONY : test_net.cpp.i + +test_net.s: test_net.cpp.s +.PHONY : test_net.s + +# target to generate assembly for a file +test_net.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_net.cpp.s +.PHONY : test_net.cpp.s + +test_neuron_layer.o: test_neuron_layer.cpp.o +.PHONY : test_neuron_layer.o + +# target to build an object file +test_neuron_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.o +.PHONY : test_neuron_layer.cpp.o + +test_neuron_layer.i: test_neuron_layer.cpp.i +.PHONY : test_neuron_layer.i + +# target to preprocess a source file +test_neuron_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.i +.PHONY : test_neuron_layer.cpp.i + +test_neuron_layer.s: test_neuron_layer.cpp.s +.PHONY : test_neuron_layer.s + +# target to generate assembly for a file 
+test_neuron_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_neuron_layer.cpp.s +.PHONY : test_neuron_layer.cpp.s + +test_platform.o: test_platform.cpp.o +.PHONY : test_platform.o + +# target to build an object file +test_platform.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.o +.PHONY : test_platform.cpp.o + +test_platform.i: test_platform.cpp.i +.PHONY : test_platform.i + +# target to preprocess a source file +test_platform.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.i +.PHONY : test_platform.cpp.i + +test_platform.s: test_platform.cpp.s +.PHONY : test_platform.s + +# target to generate assembly for a file +test_platform.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_platform.cpp.s +.PHONY : test_platform.cpp.s + +test_pooling_layer.o: test_pooling_layer.cpp.o +.PHONY : test_pooling_layer.o + +# target to build an object file +test_pooling_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.o +.PHONY : test_pooling_layer.cpp.o + +test_pooling_layer.i: test_pooling_layer.cpp.i +.PHONY : test_pooling_layer.i + +# target to preprocess a source file +test_pooling_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.i +.PHONY : test_pooling_layer.cpp.i + +test_pooling_layer.s: test_pooling_layer.cpp.s +.PHONY : test_pooling_layer.s + +# target to generate assembly for a file +test_pooling_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_pooling_layer.cpp.s +.PHONY : test_pooling_layer.cpp.s + +test_power_layer.o: test_power_layer.cpp.o +.PHONY : test_power_layer.o + +# target to build an object file +test_power_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.o +.PHONY : test_power_layer.cpp.o + +test_power_layer.i: test_power_layer.cpp.i +.PHONY : test_power_layer.i + +# target to preprocess a source file +test_power_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.i +.PHONY : test_power_layer.cpp.i + +test_power_layer.s: test_power_layer.cpp.s +.PHONY : test_power_layer.s + +# target to generate assembly for a file +test_power_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_power_layer.cpp.s +.PHONY : test_power_layer.cpp.s + +test_protobuf.o: test_protobuf.cpp.o +.PHONY : test_protobuf.o + +# target to build an object file 
+test_protobuf.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.o +.PHONY : test_protobuf.cpp.o + +test_protobuf.i: test_protobuf.cpp.i +.PHONY : test_protobuf.i + +# target to preprocess a source file +test_protobuf.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.i +.PHONY : test_protobuf.cpp.i + +test_protobuf.s: test_protobuf.cpp.s +.PHONY : test_protobuf.s + +# target to generate assembly for a file +test_protobuf.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_protobuf.cpp.s +.PHONY : test_protobuf.cpp.s + +test_random_number_generator.o: test_random_number_generator.cpp.o +.PHONY : test_random_number_generator.o + +# target to build an object file +test_random_number_generator.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.o +.PHONY : test_random_number_generator.cpp.o + +test_random_number_generator.i: test_random_number_generator.cpp.i +.PHONY : test_random_number_generator.i + +# target to preprocess a source file +test_random_number_generator.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.i +.PHONY : test_random_number_generator.cpp.i + +test_random_number_generator.s: test_random_number_generator.cpp.s +.PHONY : test_random_number_generator.s + +# target to generate assembly for a file +test_random_number_generator.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_random_number_generator.cpp.s +.PHONY : test_random_number_generator.cpp.s + +test_reduction_layer.o: test_reduction_layer.cpp.o +.PHONY : test_reduction_layer.o + +# target to build an object file +test_reduction_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.o +.PHONY : test_reduction_layer.cpp.o + +test_reduction_layer.i: test_reduction_layer.cpp.i +.PHONY : test_reduction_layer.i + +# target to preprocess a source file +test_reduction_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.i +.PHONY : test_reduction_layer.cpp.i + +test_reduction_layer.s: test_reduction_layer.cpp.s +.PHONY : test_reduction_layer.s + +# target to generate assembly for a file +test_reduction_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_reduction_layer.cpp.s +.PHONY : test_reduction_layer.cpp.s + +test_reshape_layer.o: test_reshape_layer.cpp.o +.PHONY : test_reshape_layer.o + +# target to build an object file +test_reshape_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f 
src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.o +.PHONY : test_reshape_layer.cpp.o + +test_reshape_layer.i: test_reshape_layer.cpp.i +.PHONY : test_reshape_layer.i + +# target to preprocess a source file +test_reshape_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.i +.PHONY : test_reshape_layer.cpp.i + +test_reshape_layer.s: test_reshape_layer.cpp.s +.PHONY : test_reshape_layer.s + +# target to generate assembly for a file +test_reshape_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_reshape_layer.cpp.s +.PHONY : test_reshape_layer.cpp.s + +test_sigmoid_cross_entropy_loss_layer.o: test_sigmoid_cross_entropy_loss_layer.cpp.o +.PHONY : test_sigmoid_cross_entropy_loss_layer.o + +# target to build an object file +test_sigmoid_cross_entropy_loss_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.o +.PHONY : test_sigmoid_cross_entropy_loss_layer.cpp.o + +test_sigmoid_cross_entropy_loss_layer.i: test_sigmoid_cross_entropy_loss_layer.cpp.i +.PHONY : test_sigmoid_cross_entropy_loss_layer.i + +# target to preprocess a source file +test_sigmoid_cross_entropy_loss_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.i +.PHONY : test_sigmoid_cross_entropy_loss_layer.cpp.i + +test_sigmoid_cross_entropy_loss_layer.s: test_sigmoid_cross_entropy_loss_layer.cpp.s +.PHONY : test_sigmoid_cross_entropy_loss_layer.s + +# target to generate assembly for a file +test_sigmoid_cross_entropy_loss_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_sigmoid_cross_entropy_loss_layer.cpp.s +.PHONY : test_sigmoid_cross_entropy_loss_layer.cpp.s + +test_slice_layer.o: test_slice_layer.cpp.o +.PHONY : test_slice_layer.o + +# target to build an object file +test_slice_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.o +.PHONY : test_slice_layer.cpp.o + +test_slice_layer.i: test_slice_layer.cpp.i +.PHONY : test_slice_layer.i + +# target to preprocess a source file +test_slice_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.i +.PHONY : test_slice_layer.cpp.i + +test_slice_layer.s: test_slice_layer.cpp.s +.PHONY : test_slice_layer.s + +# target to generate assembly for a file +test_slice_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_slice_layer.cpp.s +.PHONY : test_slice_layer.cpp.s + +test_softmax_layer.o: test_softmax_layer.cpp.o +.PHONY : test_softmax_layer.o + +# target to build an object file 
+test_softmax_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.o +.PHONY : test_softmax_layer.cpp.o + +test_softmax_layer.i: test_softmax_layer.cpp.i +.PHONY : test_softmax_layer.i + +# target to preprocess a source file +test_softmax_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.i +.PHONY : test_softmax_layer.cpp.i + +test_softmax_layer.s: test_softmax_layer.cpp.s +.PHONY : test_softmax_layer.s + +# target to generate assembly for a file +test_softmax_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_layer.cpp.s +.PHONY : test_softmax_layer.cpp.s + +test_softmax_with_loss_layer.o: test_softmax_with_loss_layer.cpp.o +.PHONY : test_softmax_with_loss_layer.o + +# target to build an object file +test_softmax_with_loss_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.o +.PHONY : test_softmax_with_loss_layer.cpp.o + +test_softmax_with_loss_layer.i: test_softmax_with_loss_layer.cpp.i +.PHONY : test_softmax_with_loss_layer.i + +# target to preprocess a source file +test_softmax_with_loss_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.i +.PHONY : test_softmax_with_loss_layer.cpp.i + +test_softmax_with_loss_layer.s: test_softmax_with_loss_layer.cpp.s +.PHONY : test_softmax_with_loss_layer.s + +# target to generate assembly for a file +test_softmax_with_loss_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_softmax_with_loss_layer.cpp.s +.PHONY : test_softmax_with_loss_layer.cpp.s + +test_solver.o: test_solver.cpp.o +.PHONY : test_solver.o + +# target to build an object file +test_solver.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.o +.PHONY : test_solver.cpp.o + +test_solver.i: test_solver.cpp.i +.PHONY : test_solver.i + +# target to preprocess a source file +test_solver.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.i +.PHONY : test_solver.cpp.i + +test_solver.s: test_solver.cpp.s +.PHONY : test_solver.s + +# target to generate assembly for a file +test_solver.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_solver.cpp.s +.PHONY : test_solver.cpp.s + +test_split_layer.o: test_split_layer.cpp.o +.PHONY : test_split_layer.o + +# target to build an object file +test_split_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make 
src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.o +.PHONY : test_split_layer.cpp.o + +test_split_layer.i: test_split_layer.cpp.i +.PHONY : test_split_layer.i + +# target to preprocess a source file +test_split_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.i +.PHONY : test_split_layer.cpp.i + +test_split_layer.s: test_split_layer.cpp.s +.PHONY : test_split_layer.s + +# target to generate assembly for a file +test_split_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_split_layer.cpp.s +.PHONY : test_split_layer.cpp.s + +test_spp_layer.o: test_spp_layer.cpp.o +.PHONY : test_spp_layer.o + +# target to build an object file +test_spp_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.o +.PHONY : test_spp_layer.cpp.o + +test_spp_layer.i: test_spp_layer.cpp.i +.PHONY : test_spp_layer.i + +# target to preprocess a source file +test_spp_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.i +.PHONY : test_spp_layer.cpp.i + +test_spp_layer.s: test_spp_layer.cpp.s +.PHONY : test_spp_layer.s + +# target to generate assembly for a file +test_spp_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_spp_layer.cpp.s +.PHONY : test_spp_layer.cpp.s + +test_stochastic_pooling.o: test_stochastic_pooling.cpp.o +.PHONY : test_stochastic_pooling.o + +# target to build an object file +test_stochastic_pooling.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.o +.PHONY : test_stochastic_pooling.cpp.o + +test_stochastic_pooling.i: test_stochastic_pooling.cpp.i +.PHONY : test_stochastic_pooling.i + +# target to preprocess a source file +test_stochastic_pooling.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.i +.PHONY : test_stochastic_pooling.cpp.i + +test_stochastic_pooling.s: test_stochastic_pooling.cpp.s +.PHONY : test_stochastic_pooling.s + +# target to generate assembly for a file +test_stochastic_pooling.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_stochastic_pooling.cpp.s +.PHONY : test_stochastic_pooling.cpp.s + +test_syncedmem.o: test_syncedmem.cpp.o +.PHONY : test_syncedmem.o + +# target to build an object file +test_syncedmem.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.o +.PHONY : test_syncedmem.cpp.o + +test_syncedmem.i: test_syncedmem.cpp.i +.PHONY : test_syncedmem.i + +# target to preprocess a source file +test_syncedmem.cpp.i: + cd 
/home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.i +.PHONY : test_syncedmem.cpp.i + +test_syncedmem.s: test_syncedmem.cpp.s +.PHONY : test_syncedmem.s + +# target to generate assembly for a file +test_syncedmem.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_syncedmem.cpp.s +.PHONY : test_syncedmem.cpp.s + +test_tanh_layer.o: test_tanh_layer.cpp.o +.PHONY : test_tanh_layer.o + +# target to build an object file +test_tanh_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.o +.PHONY : test_tanh_layer.cpp.o + +test_tanh_layer.i: test_tanh_layer.cpp.i +.PHONY : test_tanh_layer.i + +# target to preprocess a source file +test_tanh_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.i +.PHONY : test_tanh_layer.cpp.i + +test_tanh_layer.s: test_tanh_layer.cpp.s +.PHONY : test_tanh_layer.s + +# target to generate assembly for a file +test_tanh_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_tanh_layer.cpp.s +.PHONY : test_tanh_layer.cpp.s + +test_threshold_layer.o: test_threshold_layer.cpp.o +.PHONY : test_threshold_layer.o + +# target to build an object file +test_threshold_layer.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.o +.PHONY : test_threshold_layer.cpp.o + +test_threshold_layer.i: test_threshold_layer.cpp.i +.PHONY : test_threshold_layer.i + +# target to preprocess a source file +test_threshold_layer.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.i +.PHONY : test_threshold_layer.cpp.i + +test_threshold_layer.s: test_threshold_layer.cpp.s +.PHONY : test_threshold_layer.s + +# target to generate assembly for a file +test_threshold_layer.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_threshold_layer.cpp.s +.PHONY : test_threshold_layer.cpp.s + +test_upgrade_proto.o: test_upgrade_proto.cpp.o +.PHONY : test_upgrade_proto.o + +# target to build an object file +test_upgrade_proto.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.o +.PHONY : test_upgrade_proto.cpp.o + +test_upgrade_proto.i: test_upgrade_proto.cpp.i +.PHONY : test_upgrade_proto.i + +# target to preprocess a source file +test_upgrade_proto.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.i +.PHONY : test_upgrade_proto.cpp.i + +test_upgrade_proto.s: test_upgrade_proto.cpp.s +.PHONY : 
test_upgrade_proto.s + +# target to generate assembly for a file +test_upgrade_proto.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_upgrade_proto.cpp.s +.PHONY : test_upgrade_proto.cpp.s + +test_util_blas.o: test_util_blas.cpp.o +.PHONY : test_util_blas.o + +# target to build an object file +test_util_blas.cpp.o: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.o +.PHONY : test_util_blas.cpp.o + +test_util_blas.i: test_util_blas.cpp.i +.PHONY : test_util_blas.i + +# target to preprocess a source file +test_util_blas.cpp.i: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.i +.PHONY : test_util_blas.cpp.i + +test_util_blas.s: test_util_blas.cpp.s +.PHONY : test_util_blas.s + +# target to generate assembly for a file +test_util_blas.cpp.s: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(MAKE) -f src/caffe/test/CMakeFiles/test.testbin.dir/build.make src/caffe/test/CMakeFiles/test.testbin.dir/test_util_blas.cpp.s +.PHONY : test_util_blas.cpp.s + +# Help Target +help: + @echo "The following are some of the valid targets for this Makefile:" + @echo "... all (the default if no target is provided)" + @echo "... clean" + @echo "... depend" + @echo "... edit_cache" + @echo "... install" + @echo "... install/local" + @echo "... install/strip" + @echo "... list_install_components" + @echo "... rebuild_cache" + @echo "... runtest" + @echo "... test.testbin" + @echo "... test_accuracy_layer.o" + @echo "... test_accuracy_layer.i" + @echo "... test_accuracy_layer.s" + @echo "... test_argmax_layer.o" + @echo "... test_argmax_layer.i" + @echo "... test_argmax_layer.s" + @echo "... test_benchmark.o" + @echo "... test_benchmark.i" + @echo "... test_benchmark.s" + @echo "... test_blob.o" + @echo "... test_blob.i" + @echo "... test_blob.s" + @echo "... test_caffe_main.o" + @echo "... test_caffe_main.i" + @echo "... test_caffe_main.s" + @echo "... test_common.o" + @echo "... test_common.i" + @echo "... test_common.s" + @echo "... test_concat_layer.o" + @echo "... test_concat_layer.i" + @echo "... test_concat_layer.s" + @echo "... test_contrastive_loss_layer.o" + @echo "... test_contrastive_loss_layer.i" + @echo "... test_contrastive_loss_layer.s" + @echo "... test_convolution_layer.o" + @echo "... test_convolution_layer.i" + @echo "... test_convolution_layer.s" + @echo "... test_data_layer.o" + @echo "... test_data_layer.i" + @echo "... test_data_layer.s" + @echo "... test_data_transformer.o" + @echo "... test_data_transformer.i" + @echo "... test_data_transformer.s" + @echo "... test_db.o" + @echo "... test_db.i" + @echo "... test_db.s" + @echo "... test_deconvolution_layer.o" + @echo "... test_deconvolution_layer.i" + @echo "... test_deconvolution_layer.s" + @echo "... test_dummy_data_layer.o" + @echo "... test_dummy_data_layer.i" + @echo "... test_dummy_data_layer.s" + @echo "... test_eltwise_layer.o" + @echo "... test_eltwise_layer.i" + @echo "... test_eltwise_layer.s" + @echo "... test_euclidean_loss_layer.o" + @echo "... test_euclidean_loss_layer.i" + @echo "... test_euclidean_loss_layer.s" + @echo "... test_filler.o" + @echo "... test_filler.i" + @echo "... test_filler.s" + @echo "... test_filter_layer.o" + @echo "... 
test_filter_layer.i" + @echo "... test_filter_layer.s" + @echo "... test_flatten_layer.o" + @echo "... test_flatten_layer.i" + @echo "... test_flatten_layer.s" + @echo "... test_gradient_based_solver.o" + @echo "... test_gradient_based_solver.i" + @echo "... test_gradient_based_solver.s" + @echo "... test_hdf5_output_layer.o" + @echo "... test_hdf5_output_layer.i" + @echo "... test_hdf5_output_layer.s" + @echo "... test_hdf5data_layer.o" + @echo "... test_hdf5data_layer.i" + @echo "... test_hdf5data_layer.s" + @echo "... test_hinge_loss_layer.o" + @echo "... test_hinge_loss_layer.i" + @echo "... test_hinge_loss_layer.s" + @echo "... test_im2col_layer.o" + @echo "... test_im2col_layer.i" + @echo "... test_im2col_layer.s" + @echo "... test_image_data_layer.o" + @echo "... test_image_data_layer.i" + @echo "... test_image_data_layer.s" + @echo "... test_infogain_loss_layer.o" + @echo "... test_infogain_loss_layer.i" + @echo "... test_infogain_loss_layer.s" + @echo "... test_inner_product_layer.o" + @echo "... test_inner_product_layer.i" + @echo "... test_inner_product_layer.s" + @echo "... test_internal_thread.o" + @echo "... test_internal_thread.i" + @echo "... test_internal_thread.s" + @echo "... test_io.o" + @echo "... test_io.i" + @echo "... test_io.s" + @echo "... test_layer_factory.o" + @echo "... test_layer_factory.i" + @echo "... test_layer_factory.s" + @echo "... test_lrn_layer.o" + @echo "... test_lrn_layer.i" + @echo "... test_lrn_layer.s" + @echo "... test_math_functions.o" + @echo "... test_math_functions.i" + @echo "... test_math_functions.s" + @echo "... test_maxpool_dropout_layers.o" + @echo "... test_maxpool_dropout_layers.i" + @echo "... test_maxpool_dropout_layers.s" + @echo "... test_memory_data_layer.o" + @echo "... test_memory_data_layer.i" + @echo "... test_memory_data_layer.s" + @echo "... test_multinomial_logistic_loss_layer.o" + @echo "... test_multinomial_logistic_loss_layer.i" + @echo "... test_multinomial_logistic_loss_layer.s" + @echo "... test_mvn_layer.o" + @echo "... test_mvn_layer.i" + @echo "... test_mvn_layer.s" + @echo "... test_net.o" + @echo "... test_net.i" + @echo "... test_net.s" + @echo "... test_neuron_layer.o" + @echo "... test_neuron_layer.i" + @echo "... test_neuron_layer.s" + @echo "... test_platform.o" + @echo "... test_platform.i" + @echo "... test_platform.s" + @echo "... test_pooling_layer.o" + @echo "... test_pooling_layer.i" + @echo "... test_pooling_layer.s" + @echo "... test_power_layer.o" + @echo "... test_power_layer.i" + @echo "... test_power_layer.s" + @echo "... test_protobuf.o" + @echo "... test_protobuf.i" + @echo "... test_protobuf.s" + @echo "... test_random_number_generator.o" + @echo "... test_random_number_generator.i" + @echo "... test_random_number_generator.s" + @echo "... test_reduction_layer.o" + @echo "... test_reduction_layer.i" + @echo "... test_reduction_layer.s" + @echo "... test_reshape_layer.o" + @echo "... test_reshape_layer.i" + @echo "... test_reshape_layer.s" + @echo "... test_sigmoid_cross_entropy_loss_layer.o" + @echo "... test_sigmoid_cross_entropy_loss_layer.i" + @echo "... test_sigmoid_cross_entropy_loss_layer.s" + @echo "... test_slice_layer.o" + @echo "... test_slice_layer.i" + @echo "... test_slice_layer.s" + @echo "... test_softmax_layer.o" + @echo "... test_softmax_layer.i" + @echo "... test_softmax_layer.s" + @echo "... test_softmax_with_loss_layer.o" + @echo "... test_softmax_with_loss_layer.i" + @echo "... test_softmax_with_loss_layer.s" + @echo "... test_solver.o" + @echo "... 
test_solver.i" + @echo "... test_solver.s" + @echo "... test_split_layer.o" + @echo "... test_split_layer.i" + @echo "... test_split_layer.s" + @echo "... test_spp_layer.o" + @echo "... test_spp_layer.i" + @echo "... test_spp_layer.s" + @echo "... test_stochastic_pooling.o" + @echo "... test_stochastic_pooling.i" + @echo "... test_stochastic_pooling.s" + @echo "... test_syncedmem.o" + @echo "... test_syncedmem.i" + @echo "... test_syncedmem.s" + @echo "... test_tanh_layer.o" + @echo "... test_tanh_layer.i" + @echo "... test_tanh_layer.s" + @echo "... test_threshold_layer.o" + @echo "... test_threshold_layer.i" + @echo "... test_threshold_layer.s" + @echo "... test_upgrade_proto.o" + @echo "... test_upgrade_proto.i" + @echo "... test_upgrade_proto.s" + @echo "... test_util_blas.o" + @echo "... test_util_blas.i" + @echo "... test_util_blas.s" +.PHONY : help + + + +#============================================================================= +# Special targets to cleanup operation of make. + +# Special rule to run CMake to check the build system integrity. +# No rule that depends on this can have commands that come from listfiles +# because they might be regenerated. +cmake_check_build_system: + cd /home/yugao/caffe-merge-junli/caffe-yb/caffe && $(CMAKE_COMMAND) -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0 +.PHONY : cmake_check_build_system + diff --git a/src/caffe/test/cmake_install.cmake b/src/caffe/test/cmake_install.cmake new file mode 100644 index 00000000..fa890cd7 --- /dev/null +++ b/src/caffe/test/cmake_install.cmake @@ -0,0 +1,34 @@ +# Install script for directory: /home/yugao/caffe-merge-junli/caffe-yb/caffe/src/caffe/test + +# Set the install prefix +IF(NOT DEFINED CMAKE_INSTALL_PREFIX) + SET(CMAKE_INSTALL_PREFIX "/home/yugao/caffe-merge-junli/caffe-yb/caffe/install") +ENDIF(NOT DEFINED CMAKE_INSTALL_PREFIX) +STRING(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}") + +# Set the install configuration name. +IF(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME) + IF(BUILD_TYPE) + STRING(REGEX REPLACE "^[^A-Za-z0-9_]+" "" + CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}") + ELSE(BUILD_TYPE) + SET(CMAKE_INSTALL_CONFIG_NAME "Release") + ENDIF(BUILD_TYPE) + MESSAGE(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"") +ENDIF(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME) + +# Set the component getting installed. +IF(NOT CMAKE_INSTALL_COMPONENT) + IF(COMPONENT) + MESSAGE(STATUS "Install component: \"${COMPONENT}\"") + SET(CMAKE_INSTALL_COMPONENT "${COMPONENT}") + ELSE(COMPONENT) + SET(CMAKE_INSTALL_COMPONENT) + ENDIF(COMPONENT) +ENDIF(NOT CMAKE_INSTALL_COMPONENT) + +# Install shared libraries without execute permission? +IF(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE) + SET(CMAKE_INSTALL_SO_NO_EXE "1") +ENDIF(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE) + diff --git a/src/caffe/test/test_caffe_main.cpp b/src/caffe/test/test_caffe_main.cpp index c8caf5ac..32643b3b 100644 --- a/src/caffe/test/test_caffe_main.cpp +++ b/src/caffe/test/test_caffe_main.cpp @@ -2,38 +2,27 @@ // to allow a main function to be compiled into the binary. 
#include "caffe/caffe.hpp" +#include "caffe/common.hpp" #include "caffe/test/test_caffe_main.hpp" -namespace caffe { -#ifndef CPU_ONLY - cudaDeviceProp CAFFE_TEST_CUDA_PROP; -#endif -} - -#ifndef CPU_ONLY -using caffe::CAFFE_TEST_CUDA_PROP; -#endif int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); caffe::GlobalInit(&argc, &argv); #ifndef CPU_ONLY // Before starting testing, let's first print out a few cuda defice info. - int device; - cudaGetDeviceCount(&device); - cout << "Cuda number of devices: " << device << endl; + int device = 0; if (argc > 1) { // Use the given device device = atoi(argv[1]); - cudaSetDevice(device); + caffe::amdDevice.Init(device); cout << "Setting to use device " << device << endl; - } else if (CUDA_TEST_DEVICE >= 0) { + } else if (OPENCL_TEST_DEVICE >= 0) { // Use the device assigned in build configuration; but with a lower priority - device = CUDA_TEST_DEVICE; + device = OPENCL_TEST_DEVICE; } - cudaGetDevice(&device); cout << "Current device id: " << device << endl; - cudaGetDeviceProperties(&CAFFE_TEST_CUDA_PROP, device); + caffe::amdDevice.Init(); #endif // invoke the test. return RUN_ALL_TESTS(); diff --git a/src/caffe/test/test_common.cpp b/src/caffe/test/test_common.cpp index b3a61b0f..6c80de1d 100644 --- a/src/caffe/test/test_common.cpp +++ b/src/caffe/test/test_common.cpp @@ -14,12 +14,13 @@ class CommonTest : public ::testing::Test {}; #ifndef CPU_ONLY // GPU Caffe singleton test. +/* TEST_F(CommonTest, TestCublasHandlerGPU) { int cuda_device_id; CUDA_CHECK(cudaGetDevice(&cuda_device_id)); EXPECT_TRUE(Caffe::cublas_handle()); } - +*/ #endif TEST_F(CommonTest, TestBrewMode) { @@ -45,7 +46,7 @@ TEST_F(CommonTest, TestRandSeedCPU) { } #ifndef CPU_ONLY // GPU Caffe singleton test. - +/* TEST_F(CommonTest, TestRandSeedGPU) { SyncedMemory data_a(10 * sizeof(unsigned int)); SyncedMemory data_b(10 * sizeof(unsigned int)); @@ -60,7 +61,7 @@ TEST_F(CommonTest, TestRandSeedGPU) { ((const unsigned int*)(data_b.cpu_data()))[i]); } } - +*/ #endif } // namespace caffe diff --git a/src/caffe/test/test_convolution_layer.cpp b/src/caffe/test/test_convolution_layer.cpp index 67d41fff..576095c1 100644 --- a/src/caffe/test/test_convolution_layer.cpp +++ b/src/caffe/test/test_convolution_layer.cpp @@ -122,10 +122,11 @@ class ConvolutionLayerTest : public MultiDeviceTest { } virtual ~ConvolutionLayerTest() { - delete blob_bottom_; + /* delete blob_bottom_; delete blob_bottom_2_; delete blob_top_; delete blob_top_2_; + */ } virtual Blob* MakeReferenceTop(Blob* top) { diff --git a/src/caffe/test/test_data/generate_sample_data.py b/src/caffe/test/test_data/generate_sample_data.py old mode 100644 new mode 100755 diff --git a/src/caffe/test/test_filter_layer.cpp b/src/caffe/test/test_filter_layer.cpp index c641b6ef..801881e9 100644 --- a/src/caffe/test/test_filter_layer.cpp +++ b/src/caffe/test/test_filter_layer.cpp @@ -13,7 +13,7 @@ #include "caffe/test/test_gradient_check_util.hpp" namespace caffe { - +/* template class FilterLayerTest : public MultiDeviceTest { typedef typename TypeParam::Dtype Dtype; @@ -124,5 +124,5 @@ TYPED_TEST(FilterLayerTest, TestGradient) { checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, this->blob_top_vec_, 0); } - +*/ } // namespace caffe diff --git a/src/caffe/test/test_inner_product_layer.cpp b/src/caffe/test/test_inner_product_layer.cpp index c03df173..7913b49c 100644 --- a/src/caffe/test/test_inner_product_layer.cpp +++ b/src/caffe/test/test_inner_product_layer.cpp @@ -13,9 +13,9 @@ namespace caffe { -#ifndef 
CPU_ONLY -extern cudaDeviceProp CAFFE_TEST_CUDA_PROP; -#endif +//#ifndef CPU_ONLY +//extern cudaDeviceProp CAFFE_TEST_CUDA_PROP; +//#endif template class InnerProductLayerTest : public MultiDeviceTest { @@ -57,12 +57,8 @@ TYPED_TEST(InnerProductLayerTest, TestSetUp) { TYPED_TEST(InnerProductLayerTest, TestForward) { typedef typename TypeParam::Dtype Dtype; - bool IS_VALID_CUDA = false; -#ifndef CPU_ONLY - IS_VALID_CUDA = CAFFE_TEST_CUDA_PROP.major >= 2; -#endif if (Caffe::mode() == Caffe::CPU || - sizeof(Dtype) == 4 || IS_VALID_CUDA) { + sizeof(Dtype) == 4 ) { LayerParameter layer_param; InnerProductParameter* inner_product_param = layer_param.mutable_inner_product_param(); @@ -87,12 +83,8 @@ TYPED_TEST(InnerProductLayerTest, TestForward) { TYPED_TEST(InnerProductLayerTest, TestGradient) { typedef typename TypeParam::Dtype Dtype; - bool IS_VALID_CUDA = false; -#ifndef CPU_ONLY - IS_VALID_CUDA = CAFFE_TEST_CUDA_PROP.major >= 2; -#endif if (Caffe::mode() == Caffe::CPU || - sizeof(Dtype) == 4 || IS_VALID_CUDA) { + sizeof(Dtype) == 4 ) { LayerParameter layer_param; InnerProductParameter* inner_product_param = layer_param.mutable_inner_product_param(); diff --git a/src/caffe/test/test_math_functions.cpp b/src/caffe/test/test_math_functions.cpp index a095b544..a0f88065 100644 --- a/src/caffe/test/test_math_functions.cpp +++ b/src/caffe/test/test_math_functions.cpp @@ -232,7 +232,7 @@ TYPED_TEST(GPUMathFunctionsTest, TestCopy) { const int n = this->blob_bottom_->count(); const TypeParam* bottom_data = this->blob_bottom_->gpu_data(); TypeParam* top_data = this->blob_top_->mutable_gpu_data(); - caffe_copy(n, bottom_data, top_data); + caffe_gpu_copy(n, bottom_data, top_data); bottom_data = this->blob_bottom_->cpu_data(); top_data = this->blob_top_->mutable_cpu_data(); for (int i = 0; i < n; ++i) { diff --git a/src/caffe/test/test_platform.cpp b/src/caffe/test/test_platform.cpp index f3513e08..7a30c2db 100644 --- a/src/caffe/test/test_platform.cpp +++ b/src/caffe/test/test_platform.cpp @@ -10,10 +10,10 @@ namespace caffe { -extern cudaDeviceProp CAFFE_TEST_CUDA_PROP; +//extern cudaDeviceProp CAFFE_TEST_CUDA_PROP; class PlatformTest : public ::testing::Test {}; - +/* TEST_F(PlatformTest, TestInitialization) { printf("Major revision number: %d\n", CAFFE_TEST_CUDA_PROP.major); printf("Minor revision number: %d\n", CAFFE_TEST_CUDA_PROP.minor); @@ -51,7 +51,7 @@ TEST_F(PlatformTest, TestInitialization) { (CAFFE_TEST_CUDA_PROP.unifiedAddressing ? 
"Yes" : "No")); EXPECT_TRUE(true); } - +*/ } // namespace caffe #endif // CPU_ONLY diff --git a/src/caffe/test/test_util_blas.cpp b/src/caffe/test/test_util_blas.cpp index 8770f309..9cc9558c 100644 --- a/src/caffe/test/test_util_blas.cpp +++ b/src/caffe/test/test_util_blas.cpp @@ -12,7 +12,7 @@ namespace caffe { -extern cudaDeviceProp CAFFE_TEST_CUDA_PROP; +//extern cudaDeviceProp CAFFE_TEST_CUDA_PROP; template class GemmTest : public ::testing::Test {}; @@ -30,7 +30,8 @@ TYPED_TEST(GemmTest, TestGemmCPUGPU) { caffe_copy(6, data, A.mutable_cpu_data()); caffe_copy(12, data, B.mutable_cpu_data()); - if (sizeof(TypeParam) == 4 || CAFFE_TEST_CUDA_PROP.major >= 2) { + // if (sizeof(TypeParam) == 4 || CAFFE_TEST_CUDA_PROP.major >= 2) { + if (sizeof(TypeParam) == 4) { // [1, 2, 3; 4 5 6] * [1, 2, 3, 4; 5, 6, 7, 8; 9, 10, 11, 12]; caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, 2, 4, 3, 1., A.cpu_data(), B.cpu_data(), 0., C.mutable_cpu_data()); @@ -100,7 +101,8 @@ TYPED_TEST(GemmTest, TestGemvCPUGPU) { caffe_copy(6, data, A.mutable_cpu_data()); caffe_copy(3, data, x.mutable_cpu_data()); - if (sizeof(TypeParam) == 4 || CAFFE_TEST_CUDA_PROP.major >= 2) { + // if (sizeof(TypeParam) == 4 || CAFFE_TEST_CUDA_PROP.major >= 2) { + if (sizeof(TypeParam) == 4) { caffe_cpu_gemv(CblasNoTrans, 2, 3, 1., A.cpu_data(), x.cpu_data(), 0., y.mutable_cpu_data()); for (int i = 0; i < 2; ++i) { diff --git a/src/caffe/util/benchmark.cpp b/src/caffe/util/benchmark.cpp index 1d269c35..2dcf0e5a 100644 --- a/src/caffe/util/benchmark.cpp +++ b/src/caffe/util/benchmark.cpp @@ -6,34 +6,16 @@ namespace caffe { Timer::Timer() - : initted_(false), - running_(false), - has_run_at_least_once_(false) { + : initted_(false), running_(false), has_run_at_least_once_(false) { Init(); } Timer::~Timer() { - if (Caffe::mode() == Caffe::GPU) { -#ifndef CPU_ONLY - CUDA_CHECK(cudaEventDestroy(start_gpu_)); - CUDA_CHECK(cudaEventDestroy(stop_gpu_)); -#else - NO_GPU; -#endif - } } void Timer::Start() { if (!running()) { - if (Caffe::mode() == Caffe::GPU) { -#ifndef CPU_ONLY - CUDA_CHECK(cudaEventRecord(start_gpu_, 0)); -#else - NO_GPU; -#endif - } else { - start_cpu_ = boost::posix_time::microsec_clock::local_time(); - } + start_cpu_ = boost::posix_time::microsec_clock::local_time(); running_ = true; has_run_at_least_once_ = true; } @@ -41,21 +23,11 @@ void Timer::Start() { void Timer::Stop() { if (running()) { - if (Caffe::mode() == Caffe::GPU) { -#ifndef CPU_ONLY - CUDA_CHECK(cudaEventRecord(stop_gpu_, 0)); - CUDA_CHECK(cudaEventSynchronize(stop_gpu_)); -#else - NO_GPU; -#endif - } else { - stop_cpu_ = boost::posix_time::microsec_clock::local_time(); - } + stop_cpu_ = boost::posix_time::microsec_clock::local_time(); running_ = false; } } - float Timer::MicroSeconds() { if (!has_run_at_least_once()) { LOG(WARNING) << "Timer has never been run before reading time."; @@ -64,18 +36,8 @@ float Timer::MicroSeconds() { if (running()) { Stop(); } - if (Caffe::mode() == Caffe::GPU) { -#ifndef CPU_ONLY - CUDA_CHECK(cudaEventElapsedTime(&elapsed_milliseconds_, start_gpu_, - stop_gpu_)); - // Cuda only measure milliseconds - elapsed_microseconds_ = elapsed_milliseconds_ * 1000; -#else - NO_GPU; -#endif - } else { - elapsed_microseconds_ = (stop_cpu_ - start_cpu_).total_microseconds(); - } + + elapsed_microseconds_ = (stop_cpu_ - start_cpu_).total_microseconds(); return elapsed_microseconds_; } @@ -87,16 +49,8 @@ float Timer::MilliSeconds() { if (running()) { Stop(); } - if (Caffe::mode() == Caffe::GPU) { -#ifndef CPU_ONLY - 
CUDA_CHECK(cudaEventElapsedTime(&elapsed_milliseconds_, start_gpu_, - stop_gpu_)); -#else - NO_GPU; -#endif - } else { - elapsed_milliseconds_ = (stop_cpu_ - start_cpu_).total_milliseconds(); - } + + elapsed_milliseconds_ = (stop_cpu_ - start_cpu_).total_milliseconds(); return elapsed_milliseconds_; } @@ -107,12 +61,6 @@ float Timer::Seconds() { void Timer::Init() { if (!initted()) { if (Caffe::mode() == Caffe::GPU) { -#ifndef CPU_ONLY - CUDA_CHECK(cudaEventCreate(&start_gpu_)); - CUDA_CHECK(cudaEventCreate(&stop_gpu_)); -#else - NO_GPU; -#endif } initted_ = true; } @@ -147,8 +95,8 @@ float CPUTimer::MilliSeconds() { if (running()) { Stop(); } - this->elapsed_milliseconds_ = (this->stop_cpu_ - - this->start_cpu_).total_milliseconds(); + this->elapsed_milliseconds_ = + (this->stop_cpu_ - this->start_cpu_).total_milliseconds(); return this->elapsed_milliseconds_; } @@ -160,8 +108,8 @@ float CPUTimer::MicroSeconds() { if (running()) { Stop(); } - this->elapsed_microseconds_ = (this->stop_cpu_ - - this->start_cpu_).total_microseconds(); + this->elapsed_microseconds_ = + (this->stop_cpu_ - this->start_cpu_).total_microseconds(); return this->elapsed_microseconds_; } diff --git a/src/caffe/util/cudnn.cpp b/src/caffe/util/cudnn.cpp index 1772f009..592017c5 100644 --- a/src/caffe/util/cudnn.cpp +++ b/src/caffe/util/cudnn.cpp @@ -2,22 +2,22 @@ #include "caffe/util/cudnn.hpp" namespace caffe { -namespace cudnn { + namespace cudnn { -float dataType::oneval = 1.0; -float dataType::zeroval = 0.0; -const void* dataType::one = + float dataType::oneval = 1.0; + float dataType::zeroval = 0.0; + const void* dataType::one = static_cast(&dataType::oneval); -const void* dataType::zero = + const void* dataType::zero = static_cast(&dataType::zeroval); -double dataType::oneval = 1.0; -double dataType::zeroval = 0.0; -const void* dataType::one = + double dataType::oneval = 1.0; + double dataType::zeroval = 0.0; + const void* dataType::one = static_cast(&dataType::oneval); -const void* dataType::zero = + const void* dataType::zero = static_cast(&dataType::zeroval); -} // namespace cudnn + } // namespace cudnn } // namespace caffe #endif diff --git a/src/caffe/util/db.cpp b/src/caffe/util/db.cpp index f55420e9..fd4de1bf 100644 --- a/src/caffe/util/db.cpp +++ b/src/caffe/util/db.cpp @@ -4,7 +4,8 @@ #include -namespace caffe { namespace db { +namespace caffe { +namespace db { DB* GetDB(DataParameter::DB backend) { switch (backend) { diff --git a/src/caffe/util/db_leveldb.cpp b/src/caffe/util/db_leveldb.cpp index 06c46627..d8eac5f7 100644 --- a/src/caffe/util/db_leveldb.cpp +++ b/src/caffe/util/db_leveldb.cpp @@ -2,7 +2,8 @@ #include -namespace caffe { namespace db { +namespace caffe { +namespace db { void LevelDB::Open(const string& source, Mode mode) { leveldb::Options options; @@ -12,8 +13,8 @@ void LevelDB::Open(const string& source, Mode mode) { options.error_if_exists = mode == NEW; options.create_if_missing = mode != READ; leveldb::Status status = leveldb::DB::Open(options, source, &db_); - CHECK(status.ok()) << "Failed to open leveldb " << source - << std::endl << status.ToString(); + CHECK(status.ok()) << "Failed to open leveldb " << source << std::endl + << status.ToString(); LOG(INFO) << "Opened leveldb " << source; } diff --git a/src/caffe/util/db_lmdb.cpp b/src/caffe/util/db_lmdb.cpp index a054b796..126b3790 100644 --- a/src/caffe/util/db_lmdb.cpp +++ b/src/caffe/util/db_lmdb.cpp @@ -4,14 +4,15 @@ #include -namespace caffe { namespace db { +namespace caffe { +namespace db { const size_t LMDB_MAP_SIZE = 
1099511627776; // 1 TB void LMDB::Open(const string& source, Mode mode) { MDB_CHECK(mdb_env_create(&mdb_env_)); - MDB_CHECK(mdb_env_set_mapsize(mdb_env_, LMDB_MAP_SIZE)); - if (mode == NEW) { + MDB_CHECK (mdb_env_set_mapsize(mdb_env_, LMDB_MAP_SIZE));if +( mode == NEW) { CHECK_EQ(mkdir(source.c_str(), 0744), 0) << "mkdir " << source << "failed"; } int flags = 0; diff --git a/src/caffe/util/im2col.cpp b/src/caffe/util/im2col.cpp index c48f31f3..e9c07970 100644 --- a/src/caffe/util/im2col.cpp +++ b/src/caffe/util/im2col.cpp @@ -1,18 +1,45 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ **************************************************************************************/ + #include #include #include +#include "caffe/common.hpp" #include "caffe/util/im2col.hpp" #include "caffe/util/math_functions.hpp" namespace caffe { +template extern std::string get_dtype_suffix(); + template -void im2col_cpu(const Dtype* data_im, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - Dtype* data_col) { +void im2col_cpu(const Dtype* data_im, const int channels, const int height, + const int width, const int kernel_h, const int kernel_w, const int pad_h, + const int pad_w, const int stride_h, const int stride_w, Dtype* data_col) { int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1; int channels_col = channels * kernel_h * kernel_w; @@ -25,8 +52,8 @@ void im2col_cpu(const Dtype* data_im, const int channels, int h_pad = h * stride_h - pad_h + h_offset; int w_pad = w * stride_w - pad_w + w_offset; if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width) - data_col[(c * height_col + h) * width_col + w] = - data_im[(c_im * height + h_pad) * width + w_pad]; + data_col[(c * height_col + h) * width_col + w] = data_im[(c_im + * height + h_pad) * width + w_pad]; else data_col[(c * height_col + h) * width_col + w] = 0; } @@ -34,22 +61,19 @@ void im2col_cpu(const Dtype* data_im, const int channels, } } -// Explicit instantiation template void im2col_cpu(const float* data_im, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, float* data_col); + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + float* data_col); template void im2col_cpu(const double* data_im, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, double* data_col); + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + double* data_col); template -void col2im_cpu(const Dtype* data_col, const int channels, - const int height, const int width, const int patch_h, const int patch_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - Dtype* data_im) { +void col2im_cpu(const Dtype* data_col, const int channels, const int height, + const int width, const int patch_h, const int patch_w, const int pad_h, + const int pad_w, const int stride_h, const int stride_w, Dtype* data_im) { caffe_set(height * width * channels, Dtype(0), data_im); int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1; int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1; @@ -63,21 +87,214 @@ void col2im_cpu(const Dtype* data_col, const int channels, int h_pad = h * stride_h - pad_h + h_offset; int w_pad = w * stride_w - pad_w + w_offset; if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width) - data_im[(c_im * height + h_pad) * width + w_pad] += - data_col[(c * height_col + h) * width_col + w]; + data_im[(c_im * height + h_pad) * width + w_pad] += data_col[(c + * height_col + h) * width_col + w]; } } } } -// Explicit instantiation template void col2im_cpu(const float* data_col, const int channels, const int height, const int width, const int patch_h, const int patch_w, - const int pad_h, const int pad_w, const int stride_h, - 
const int stride_w, float* data_im); + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + float* data_im); template void col2im_cpu(const double* data_col, const int channels, const int height, const int width, const int patch_h, const int patch_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, double* data_im); + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + double* data_im); + +#ifndef CPU_ONLY +template +void col2im_gpu_opt(const Dtype* data_col, const int col_offset, + const int channels, const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, Dtype* data_im, const int img_offset, + int optnum) { + std::string kernel_name = "col2im_opt" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; + int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1; + int num_kernels = channels * height * width * optnum; + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_col); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &col_offset); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &kernel_h); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &kernel_w); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pad_h); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &pad_w); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &height_col); + ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &width_col); + ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*) &data_im); + ret |= clSetKernelArg(Kernel, 15, sizeof(cl_int), (void*) &img_offset); + ret |= clSetKernelArg(Kernel, 16, sizeof(cl_int), (void*) &optnum); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) num_kernels }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} + +template void col2im_gpu_opt(const float* data_col, const int col_offset, + const int channels, const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + float* data_im, const int img_offset, + int optnum); +template void col2im_gpu_opt(const double* data_col, + const int col_offset, const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, double* data_im, + const int img_offset, int optnum); + +template +void im2col_gpu(const Dtype* data_im, const int img_offset, const int channels, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + Dtype* data_col, const int col_offset) { + std::string kernel_name = "im2col" + get_dtype_suffix(); + cl_kernel Kernel 
= amdDevice.GetKernel(kernel_name); + + int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; + int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1; + int num_kernels = channels * height_col * width_col; + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_im); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &img_offset); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &kernel_h); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &kernel_w); + + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pad_h); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pad_w); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &stride_h); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_w); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &height_col); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &width_col); + ret |= clSetKernelArg(Kernel, 13, sizeof(cl_mem), (void*) &data_col); + ret |= clSetKernelArg(Kernel, 14, sizeof(cl_int), (void*) &col_offset); + + size_t uiGlobal_Work_Size[] = { (size_t) num_kernels }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); + +} + +template void im2col_gpu(const float* data_im, const int img_offset, + const int channels, const int height, const int width, const int kernel_h, + const int kernel_w, const int pad_h, const int pad_w, const int stride_h, + const int stride_w, float* data_col, const int col_offset); +template void im2col_gpu(const double* data_im, const int img_offset, + const int channels, const int height, const int width, const int kernel_h, + const int kernel_w, const int pad_h, const int pad_w, const int stride_h, + const int stride_w, double* data_col, const int col_offset); + +template +void col2im_gpu(const Dtype* data_col, const int col_offset, const int channels, const int height, + const int width, const int patch_h, const int patch_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + Dtype* data_im, const int img_offset) { + std::string kernel_name = "col2im" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1; + int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1; + int num_kernels = channels * height * width; + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_col); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &col_offset); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &patch_h); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &patch_w); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pad_h); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &pad_w); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) 
&stride_w); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &height_col); + ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &width_col); + ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*) &data_im); + ret |= clSetKernelArg(Kernel, 15, sizeof(cl_int), (void*) &img_offset); + + size_t uiGlobal_Work_Size[] = { (size_t) num_kernels }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} + +template void col2im_gpu(const float* data_col, const int col_offset, + const int channels, const int height, const int width, const int patch_h, + const int patch_w, const int pad_h, const int pad_w, const int stride_h, + const int stride_w, float* data_im, const int img_offset); +template void col2im_gpu(const double* data_col, const int col_offset, + const int channels, const int height, const int width, const int patch_h, + const int patch_w, const int pad_h, const int pad_w, const int stride_h, + const int stride_w, double* data_im, const int img_offset); + +template +void im2col_gpu_opt(const Dtype* data_im, const int img_offset, + const int channels, const int height, const int width,const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, Dtype* data_col, const int col_offset, + int optnum) { + + std::string kernel_name = "im2col_opt" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; + int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1; + int num_kernels = optnum * channels * height_col * width_col; + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_im); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &img_offset); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &kernel_h); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &kernel_w); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pad_h); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &pad_w); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &height_col); + ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &width_col); + ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*) &data_col); + ret |= clSetKernelArg(Kernel, 15, sizeof(cl_int), (void*) &col_offset); + ret |= clSetKernelArg(Kernel, 16, sizeof(cl_int), (void*) &optnum); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) num_kernels }; + size_t uiLocal_Work_Size[] = { (size_t)(256 - 256 % width_col) }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} + +template void im2col_gpu_opt(const float* data_im, const int img_offset, + const int channels, const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, float* data_col, const int col_offset, + int optnum); 
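
Aside (editor's illustration, not part of the patch): the OpenCL im2col/col2im paths above all follow the same host-side launch pattern — compute the output geometry, bind each kernel argument by index with clSetKernelArg, then enqueue a 1-D NDRange with one work-item per output element and a fixed work-group size of 256. The sketch below shows that pattern in isolation for a hypothetical, simplified kernel signature (no img_offset/col_offset arguments); "queue" and "kernel" are assumed to have been created elsewhere, and the rounding of the global size is an OpenCL requirement noted here, not something the patch itself does.

#include <CL/cl.h>
#include <cstdio>

static void launch_im2col_like(cl_command_queue queue, cl_kernel kernel,
                               cl_mem data_im, cl_mem data_col,
                               int channels, int height, int width,
                               int kernel_h, int kernel_w,
                               int pad_h, int pad_w,
                               int stride_h, int stride_w) {
  // Output spatial size, computed the same way as in im2col_gpu above.
  int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
  int width_col  = (width  + 2 * pad_w - kernel_w) / stride_w + 1;
  int num_kernels = channels * height_col * width_col;

  cl_int ret = CL_SUCCESS;
  cl_uint idx = 0;
  // Arguments must be bound in the order the kernel declares them.
  ret |= clSetKernelArg(kernel, idx++, sizeof(cl_int), &num_kernels);
  ret |= clSetKernelArg(kernel, idx++, sizeof(cl_mem), &data_im);
  ret |= clSetKernelArg(kernel, idx++, sizeof(cl_int), &height);
  ret |= clSetKernelArg(kernel, idx++, sizeof(cl_int), &width);
  ret |= clSetKernelArg(kernel, idx++, sizeof(cl_int), &kernel_h);
  ret |= clSetKernelArg(kernel, idx++, sizeof(cl_int), &kernel_w);
  ret |= clSetKernelArg(kernel, idx++, sizeof(cl_int), &pad_h);
  ret |= clSetKernelArg(kernel, idx++, sizeof(cl_int), &pad_w);
  ret |= clSetKernelArg(kernel, idx++, sizeof(cl_int), &stride_h);
  ret |= clSetKernelArg(kernel, idx++, sizeof(cl_int), &stride_w);
  ret |= clSetKernelArg(kernel, idx++, sizeof(cl_int), &height_col);
  ret |= clSetKernelArg(kernel, idx++, sizeof(cl_int), &width_col);
  ret |= clSetKernelArg(kernel, idx++, sizeof(cl_mem), &data_col);
  if (ret != CL_SUCCESS) {
    fprintf(stderr, "clSetKernelArg failed: %d\n", ret);
    return;
  }

  // One work-item per (channel, y_out, x_out) element, work-group of 256 as in
  // uiLocal_Work_Size above.  OpenCL 1.x requires the global size to be a
  // multiple of the local size, so this sketch rounds up; the kernel is then
  // expected to bounds-check its global id against num_kernels.
  size_t local = 256;
  size_t global = ((size_t)num_kernels + local - 1) / local * local;
  ret = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local,
                               0, NULL, NULL);
  if (ret != CL_SUCCESS) {
    fprintf(stderr, "clEnqueueNDRangeKernel failed: %d\n", ret);
  }
}
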
+template void im2col_gpu_opt(const double* data_im, + const int img_offset, const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, double* data_col, + const int col_offset, int optnum); +#endif } // namespace caffe diff --git a/src/caffe/util/im2col.cu b/src/caffe/util/im2col.cu index c90f93eb..0848017a 100644 --- a/src/caffe/util/im2col.cu +++ b/src/caffe/util/im2col.cu @@ -32,7 +32,7 @@ __global__ void im2col_gpu_kernel(const int n, const Dtype* data_im, int h = h_in + i; int w = w_in + j; *data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ? - data_im_ptr[i * width + j] : 0; + data_im_ptr[i * width + j] : 0; data_col_ptr += height_col * width_col; } } @@ -40,11 +40,9 @@ __global__ void im2col_gpu_kernel(const int n, const Dtype* data_im, } template -void im2col_gpu(const Dtype* data_im, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - Dtype* data_col) { +void im2col_gpu(const Dtype* data_im, const int channels, const int height, + const int width, const int kernel_h, const int kernel_w, const int pad_h, + const int pad_w, const int stride_h, const int stride_w, Dtype* data_col) { // We are going to launch channels * height_col * width_col kernels, each // kernel responsible for copying a single-channel grid. int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; @@ -52,14 +50,13 @@ void im2col_gpu(const Dtype* data_im, const int channels, int num_kernels = channels * height_col * width_col; // NOLINT_NEXT_LINE(whitespace/operators) im2col_gpu_kernel<<>>( + CAFFE_CUDA_NUM_THREADS>>>( num_kernels, data_im, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, height_col, width_col, data_col); CUDA_POST_KERNEL_CHECK; } - // Explicit instantiation template void im2col_gpu(const float* data_im, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, @@ -88,19 +85,9 @@ __global__ void col2im_gpu_kernel(const int n, const Dtype* data_col, int w_col_end = min(w / stride_w + 1, width_col); int h_col_start = (h < patch_h) ? 
0 : (h - patch_h) / stride_h + 1; int h_col_end = min(h / stride_h + 1, height_col); - /* - for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { - for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { - // the col location: [c * width * height + h_out, w_out] - int c_col = c * patch_h * patch_w + (h - h_col * stride_h) * ksize - + (w - w_col * stride_w); - val += data_col[(c_col * height_col + h_col) * width_col + w_col]; - } - } - */ // equivalent implementation int offset = - (c * patch_h * patch_w + h * patch_w + w) * height_col * width_col; + (c * patch_h * patch_w + h * patch_w + w) * height_col * width_col; int coeff_h_col = (1 - stride_h * patch_w * height_col) * width_col; int coeff_w_col = (1 - stride_w * height_col * width_col); for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { @@ -113,10 +100,9 @@ __global__ void col2im_gpu_kernel(const int n, const Dtype* data_col, } template -void col2im_gpu(const Dtype* data_col, const int channels, - const int height, const int width, const int patch_h, const int patch_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, Dtype* data_im) { +void col2im_gpu(const Dtype* data_col, const int channels, const int height, + const int width, const int patch_h, const int patch_w, const int pad_h, + const int pad_w, const int stride_h, const int stride_w, Dtype* data_im) { int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1; int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1; int num_kernels = channels * height * width; @@ -124,7 +110,7 @@ void col2im_gpu(const Dtype* data_col, const int channels, // bottom dimension, and then in the kernel add up the top dimensions. // NOLINT_NEXT_LINE(whitespace/operators) col2im_gpu_kernel<<>>( + CAFFE_CUDA_NUM_THREADS>>>( num_kernels, data_col, height, width, channels, patch_h, patch_w, pad_h, pad_w, stride_h, stride_w, height_col, width_col, data_im); @@ -134,11 +120,11 @@ void col2im_gpu(const Dtype* data_col, const int channels, // Explicit instantiation template void col2im_gpu(const float* data_col, const int channels, const int height, const int width, const int patch_h, const int patch_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, float* data_im); + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + float* data_im); template void col2im_gpu(const double* data_col, const int channels, const int height, const int width, const int patch_h, const int patch_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, double* data_im); + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + double* data_im); } // namespace caffe diff --git a/src/caffe/util/insert_splits.cpp b/src/caffe/util/insert_splits.cpp index 416f80ab..7974b0ea 100644 --- a/src/caffe/util/insert_splits.cpp +++ b/src/caffe/util/insert_splits.cpp @@ -30,8 +30,8 @@ void InsertSplits(const NetParameter& param, NetParameter* param_split) { layer_idx_to_layer_name[i] = layer_param.name(); for (int j = 0; j < layer_param.bottom_size(); ++j) { const string& blob_name = layer_param.bottom(j); - if (blob_name_to_last_top_idx.find(blob_name) == - blob_name_to_last_top_idx.end()) { + if (blob_name_to_last_top_idx.find(blob_name) + == blob_name_to_last_top_idx.end()) { LOG(FATAL) << "Unknown blob input " << blob_name << " to layer " << j; } const pair& bottom_idx = make_pair(i, j); @@ -45,8 +45,8 @@ void InsertSplits(const NetParameter& param, NetParameter* param_split) { } // A use 
of a top blob as a loss should be handled similarly to the use of // a top blob as an input (bottom) blob to another layer. - const int last_loss = - std::min(layer_param.loss_weight_size(), layer_param.top_size()); + const int last_loss = std::min(layer_param.loss_weight_size(), + layer_param.top_size()); for (int j = 0; j < last_loss; ++j) { const string& blob_name = layer_param.top(j); const pair& top_idx = blob_name_to_last_top_idx[blob_name]; @@ -74,14 +74,15 @@ void InsertSplits(const NetParameter& param, NetParameter* param_split) { layer_param->CopyFrom(param.layer(i)); // Replace any shared bottom blobs with split layer outputs. for (int j = 0; j < layer_param->bottom_size(); ++j) { - const pair& top_idx = - bottom_idx_to_source_top_idx[make_pair(i, j)]; + const pair& top_idx = bottom_idx_to_source_top_idx[make_pair(i, + j)]; const int split_count = top_idx_to_bottom_count[top_idx]; if (split_count > 1) { const string& layer_name = layer_idx_to_layer_name[top_idx.first]; const string& blob_name = layer_param->bottom(j); - layer_param->set_bottom(j, SplitBlobName(layer_name, - blob_name, top_idx.second, top_idx_to_bottom_split_idx[top_idx]++)); + layer_param->set_bottom(j, + SplitBlobName(layer_name, blob_name, top_idx.second, + top_idx_to_bottom_split_idx[top_idx]++)); } } // Create split layer for any top blobs used by other layer as bottom @@ -94,8 +95,8 @@ void InsertSplits(const NetParameter& param, NetParameter* param_split) { const string& blob_name = layer_param->top(j); LayerParameter* split_layer_param = param_split->add_layer(); const float loss_weight = top_idx_to_loss_weight[top_idx]; - ConfigureSplitLayer(layer_name, blob_name, j, split_count, - loss_weight, split_layer_param); + ConfigureSplitLayer(layer_name, blob_name, j, split_count, loss_weight, + split_layer_param); if (loss_weight) { layer_param->clear_loss_weight(); top_idx_to_bottom_split_idx[top_idx]++; diff --git a/src/caffe/util/io.cpp b/src/caffe/util/io.cpp index 77ef7f25..09824880 100644 --- a/src/caffe/util/io.cpp +++ b/src/caffe/util/io.cpp @@ -67,11 +67,10 @@ void WriteProtoToBinaryFile(const Message& proto, const char* filename) { CHECK(proto.SerializeToOstream(&output)); } -cv::Mat ReadImageToCVMat(const string& filename, - const int height, const int width, const bool is_color) { +cv::Mat ReadImageToCVMat(const string& filename, const int height, + const int width, const bool is_color) { cv::Mat cv_img; - int cv_read_flag = (is_color ? CV_LOAD_IMAGE_COLOR : - CV_LOAD_IMAGE_GRAYSCALE); + int cv_read_flag = (is_color ? CV_LOAD_IMAGE_COLOR : CV_LOAD_IMAGE_GRAYSCALE); cv::Mat cv_img_origin = cv::imread(filename, cv_read_flag); if (!cv_img_origin.data) { LOG(ERROR) << "Could not open or find file " << filename; @@ -85,13 +84,12 @@ cv::Mat ReadImageToCVMat(const string& filename, return cv_img; } -cv::Mat ReadImageToCVMat(const string& filename, - const int height, const int width) { +cv::Mat ReadImageToCVMat(const string& filename, const int height, + const int width) { return ReadImageToCVMat(filename, height, width, true); } -cv::Mat ReadImageToCVMat(const string& filename, - const bool is_color) { +cv::Mat ReadImageToCVMat(const string& filename, const bool is_color) { return ReadImageToCVMat(filename, 0, 0, is_color); } @@ -99,31 +97,30 @@ cv::Mat ReadImageToCVMat(const string& filename) { return ReadImageToCVMat(filename, 0, 0, true); } // Do the file extension and encoding match? 
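
Aside (editor's illustration, not part of the patch): the matchExt/ReadImageToDatum hunks that follow exist so that an image whose on-disk encoding already matches the requested one can be copied into the Datum byte-for-byte, skipping an OpenCV decode/re-encode round trip. A minimal sketch of that decision is below, assuming OpenCV 2.x as the patch does; encode_for_datum and already_in_requested_encoding are hypothetical names standing in for the real ReadImageToDatum/matchExt pair.

#include <opencv2/highgui/highgui.hpp>
#include <fstream>
#include <iterator>
#include <string>
#include <vector>

// Returns the bytes that would be stored in the Datum's data field.
static std::vector<unsigned char> encode_for_datum(
    const std::string& filename, const std::string& encoding,
    bool is_color, bool already_in_requested_encoding) {
  if (already_in_requested_encoding) {
    // Fast path: the file is already in the requested format, so copy the
    // raw bytes instead of decoding and re-encoding.
    std::ifstream file(filename.c_str(), std::ios::in | std::ios::binary);
    return std::vector<unsigned char>((std::istreambuf_iterator<char>(file)),
                                      std::istreambuf_iterator<char>());
  }
  // Slow path: decode with OpenCV, then re-encode to the requested format.
  int flag = is_color ? CV_LOAD_IMAGE_COLOR : CV_LOAD_IMAGE_GRAYSCALE;
  cv::Mat img = cv::imread(filename, flag);
  std::vector<unsigned char> buf;
  if (!img.empty()) {
    cv::imencode("." + encoding, img, buf);
  }
  return buf;
}
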
-static bool matchExt(const std::string & fn, - std::string en) { +static bool matchExt(const std::string & fn, std::string en) { size_t p = fn.rfind('.'); std::string ext = p != fn.npos ? fn.substr(p) : fn; std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower); std::transform(en.begin(), en.end(), en.begin(), ::tolower); - if ( ext == en ) + if (ext == en) return true; - if ( en == "jpg" && ext == "jpeg" ) + if (en == "jpg" && ext == "jpeg") return true; return false; } -bool ReadImageToDatum(const string& filename, const int label, - const int height, const int width, const bool is_color, - const std::string & encoding, Datum* datum) { +bool ReadImageToDatum(const string& filename, const int label, const int height, + const int width, const bool is_color, const std::string & encoding, + Datum* datum) { cv::Mat cv_img = ReadImageToCVMat(filename, height, width, is_color); if (cv_img.data) { if (encoding.size()) { - if ( (cv_img.channels() == 3) == is_color && !height && !width && - matchExt(filename, encoding) ) + if ((cv_img.channels() == 3) == is_color && !height && !width + && matchExt(filename, encoding)) return ReadFileToDatum(filename, label, datum); - std::vector buf; - cv::imencode("."+encoding, cv_img, buf); - datum->set_data(std::string(reinterpret_cast(&buf[0]), - buf.size())); + std::vector < uchar > buf; + cv::imencode("." + encoding, cv_img, buf); + datum->set_data( + std::string(reinterpret_cast(&buf[0]), buf.size())); datum->set_label(label); datum->set_encoded(true); return true; @@ -136,11 +133,10 @@ bool ReadImageToDatum(const string& filename, const int label, } } -bool ReadFileToDatum(const string& filename, const int label, - Datum* datum) { +bool ReadFileToDatum(const string& filename, const int label, Datum* datum) { std::streampos size; - fstream file(filename.c_str(), ios::in|ios::binary|ios::ate); + fstream file(filename.c_str(), ios::in | ios::binary | ios::ate); if (file.is_open()) { size = file.tellg(); std::string buffer(size, ' '); @@ -172,8 +168,7 @@ cv::Mat DecodeDatumToCVMat(const Datum& datum, bool is_color) { CHECK(datum.encoded()) << "Datum not encoded"; const string& data = datum.data(); std::vector vec_data(data.c_str(), data.c_str() + data.size()); - int cv_read_flag = (is_color ? CV_LOAD_IMAGE_COLOR : - CV_LOAD_IMAGE_GRAYSCALE); + int cv_read_flag = (is_color ? CV_LOAD_IMAGE_COLOR : CV_LOAD_IMAGE_GRAYSCALE); cv_img = cv::imdecode(vec_data, cv_read_flag); if (!cv_img.data) { LOG(ERROR) << "Could not decode datum "; @@ -216,7 +211,7 @@ void CVMatToDatum(const cv::Mat& cv_img, Datum* datum) { int datum_size = datum_channels * datum_height * datum_width; std::string buffer(datum_size, ' '); for (int h = 0; h < datum_height; ++h) { - const uchar* ptr = cv_img.ptr(h); + const uchar* ptr = cv_img.ptr < uchar > (h); int img_index = 0; for (int w = 0; w < datum_width; ++w) { for (int c = 0; c < datum_channels; ++c) { @@ -230,9 +225,8 @@ void CVMatToDatum(const cv::Mat& cv_img, Datum* datum) { // Verifies format of data stored in HDF5 file and reshapes blob accordingly. template -void hdf5_load_nd_dataset_helper( - hid_t file_id, const char* dataset_name_, int min_dim, int max_dim, - Blob* blob) { +void hdf5_load_nd_dataset_helper(hid_t file_id, const char* dataset_name_, + int min_dim, int max_dim, Blob* blob) { // Verify that the dataset exists. 
CHECK(H5LTfind_dataset(file_id, dataset_name_)) << "Failed to find HDF5 dataset " << dataset_name_; @@ -245,10 +239,10 @@ void hdf5_load_nd_dataset_helper( CHECK_LE(ndims, max_dim); // Verify that the data format is what we expect: float or double. - std::vector dims(ndims); + std::vector < hsize_t > dims(ndims); H5T_class_t class_; - status = H5LTget_dataset_info( - file_id, dataset_name_, dims.data(), &class_, NULL); + status = H5LTget_dataset_info(file_id, dataset_name_, dims.data(), &class_, + NULL); CHECK_GE(status, 0) << "Failed to get dataset info for " << dataset_name_; CHECK_EQ(class_, H5T_FLOAT) << "Expected float or double data"; @@ -261,45 +255,45 @@ void hdf5_load_nd_dataset_helper( template <> void hdf5_load_nd_dataset(hid_t file_id, const char* dataset_name_, - int min_dim, int max_dim, Blob* blob) { + int min_dim, int max_dim, Blob* blob) { hdf5_load_nd_dataset_helper(file_id, dataset_name_, min_dim, max_dim, blob); - herr_t status = H5LTread_dataset_float( - file_id, dataset_name_, blob->mutable_cpu_data()); + herr_t status = H5LTread_dataset_float(file_id, dataset_name_, + blob->mutable_cpu_data()); CHECK_GE(status, 0) << "Failed to read float dataset " << dataset_name_; } template <> void hdf5_load_nd_dataset(hid_t file_id, const char* dataset_name_, - int min_dim, int max_dim, Blob* blob) { + int min_dim, int max_dim, Blob* blob) { hdf5_load_nd_dataset_helper(file_id, dataset_name_, min_dim, max_dim, blob); - herr_t status = H5LTread_dataset_double( - file_id, dataset_name_, blob->mutable_cpu_data()); + herr_t status = H5LTread_dataset_double(file_id, dataset_name_, + blob->mutable_cpu_data()); CHECK_GE(status, 0) << "Failed to read double dataset " << dataset_name_; } template <> -void hdf5_save_nd_dataset( - const hid_t file_id, const string& dataset_name, const Blob& blob) { +void hdf5_save_nd_dataset(const hid_t file_id, + const string& dataset_name, const Blob& blob) { hsize_t dims[HDF5_NUM_DIMS]; dims[0] = blob.num(); dims[1] = blob.channels(); dims[2] = blob.height(); dims[3] = blob.width(); - herr_t status = H5LTmake_dataset_float( - file_id, dataset_name.c_str(), HDF5_NUM_DIMS, dims, blob.cpu_data()); + herr_t status = H5LTmake_dataset_float(file_id, dataset_name.c_str(), + HDF5_NUM_DIMS, dims, blob.cpu_data()); CHECK_GE(status, 0) << "Failed to make float dataset " << dataset_name; } template <> -void hdf5_save_nd_dataset( - const hid_t file_id, const string& dataset_name, const Blob& blob) { +void hdf5_save_nd_dataset(const hid_t file_id, + const string& dataset_name, const Blob& blob) { hsize_t dims[HDF5_NUM_DIMS]; dims[0] = blob.num(); dims[1] = blob.channels(); dims[2] = blob.height(); dims[3] = blob.width(); - herr_t status = H5LTmake_dataset_double( - file_id, dataset_name.c_str(), HDF5_NUM_DIMS, dims, blob.cpu_data()); + herr_t status = H5LTmake_dataset_double(file_id, dataset_name.c_str(), + HDF5_NUM_DIMS, dims, blob.cpu_data()); CHECK_GE(status, 0) << "Failed to make double dataset " << dataset_name; } diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index 0aab6b17..e45fd564 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -1,3 +1,29 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + #include #include @@ -6,29 +32,34 @@ #include "caffe/common.hpp" #include "caffe/util/math_functions.hpp" #include "caffe/util/rng.hpp" +#include "caffe/util/ocl_util.hpp" +#include "caffe/util/ocl_wrapper.hpp" + +static const clblasOrder order = clblasColumnMajor; +#define pi 3.1415926 namespace caffe { -template<> +template <> void caffe_cpu_gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C) { int lda = (TransA == CblasNoTrans) ? K : M; int ldb = (TransB == CblasNoTrans) ? N : K; - cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, - ldb, beta, C, N); + cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, + beta, C, N); } -template<> +template <> void caffe_cpu_gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const double alpha, const double* A, const double* B, const double beta, double* C) { int lda = (TransA == CblasNoTrans) ? K : M; int ldb = (TransB == CblasNoTrans) ? 
N : K; - cblas_dgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, - ldb, beta, C, N); + cblas_dgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, + beta, C, N); } template <> @@ -47,16 +78,20 @@ void caffe_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, template <> void caffe_axpy(const int N, const float alpha, const float* X, - float* Y) { cblas_saxpy(N, alpha, X, 1, Y, 1); } + float* Y) { + cblas_saxpy(N, alpha, X, 1, Y, 1); +} template <> void caffe_axpy(const int N, const double alpha, const double* X, - double* Y) { cblas_daxpy(N, alpha, X, 1, Y, 1); } + double* Y) { + cblas_daxpy(N, alpha, X, 1, Y, 1); +} -template -void caffe_set(const int N, const Dtype alpha, Dtype* Y) { +template <> +void caffe_set(const int N, const float alpha, float* Y) { if (alpha == 0) { - memset(Y, 0, sizeof(Dtype) * N); // NOLINT(caffe/alt_fn) + memset(Y, 0, sizeof(float) * N); return; } for (int i = 0; i < N; ++i) { @@ -64,10 +99,28 @@ void caffe_set(const int N, const Dtype alpha, Dtype* Y) { } } -template void caffe_set(const int N, const int alpha, int* Y); -template void caffe_set(const int N, const float alpha, float* Y); -template void caffe_set(const int N, const double alpha, double* Y); +template <> +void caffe_set(const int N, const double alpha, double* Y) { + if (alpha == 0) { + memset(Y, 0, sizeof(double) * N); + return; + } + for (int i = 0; i < N; ++i) { + Y[i] = alpha; + } +} + +/* +template <> +void caffe_copy(const int N, const float* X, float* Y) { + cblas_scopy(N, X, 1, Y, 1); +} +template <> +void caffe_copy(const int N, const double* X, double* Y) { + cblas_dcopy(N, X, 1, Y, 1); +} +*/ template <> void caffe_add_scalar(const int N, const float alpha, float* Y) { for (int i = 0; i < N; ++i) { @@ -82,28 +135,6 @@ void caffe_add_scalar(const int N, const double alpha, double* Y) { } } -template -void caffe_copy(const int N, const Dtype* X, Dtype* Y) { - if (X != Y) { - if (Caffe::mode() == Caffe::GPU) { -#ifndef CPU_ONLY - // NOLINT_NEXT_LINE(caffe/alt_fn) - CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyDefault)); -#else - NO_GPU; -#endif - } else { - memcpy(Y, X, sizeof(Dtype) * N); // NOLINT(caffe/alt_fn) - } - } -} - -template void caffe_copy(const int N, const int* X, int* Y); -template void caffe_copy(const int N, const unsigned int* X, - unsigned int* Y); -template void caffe_copy(const int N, const float* X, float* Y); -template void caffe_copy(const int N, const double* X, double* Y); - template <> void caffe_scal(const int N, const float alpha, float *X) { cblas_sscal(N, alpha, X, 1); @@ -116,19 +147,18 @@ void caffe_scal(const int N, const double alpha, double *X) { template <> void caffe_cpu_axpby(const int N, const float alpha, const float* X, - const float beta, float* Y) { + const float beta, float* Y) { cblas_saxpby(N, alpha, X, 1, beta, Y, 1); } template <> void caffe_cpu_axpby(const int N, const double alpha, const double* X, - const double beta, double* Y) { + const double beta, double* Y) { cblas_daxpby(N, alpha, X, 1, beta, Y, 1); } template <> -void caffe_add(const int n, const float* a, const float* b, - float* y) { +void caffe_add(const int n, const float* a, const float* b, float* y) { vsAdd(n, a, b, y); } @@ -139,8 +169,7 @@ void caffe_add(const int n, const double* a, const double* b, } template <> -void caffe_sub(const int n, const float* a, const float* b, - float* y) { +void caffe_sub(const int n, const float* a, const float* b, float* y) { vsSub(n, a, b, y); } @@ -151,8 +180,7 @@ void caffe_sub(const int n, 
const double* a, const double* b, } template <> -void caffe_mul(const int n, const float* a, const float* b, - float* y) { +void caffe_mul(const int n, const float* a, const float* b, float* y) { vsMul(n, a, b, y); } @@ -163,8 +191,67 @@ void caffe_mul(const int n, const double* a, const double* b, } template <> -void caffe_div(const int n, const float* a, const float* b, - float* y) { +float caffe_cpu_strided_dot(const int n, const float* x, const int incx, + const float* y, const int incy) { + return cblas_sdot(n, x, incx, y, incy); +} + +template <> +double caffe_cpu_strided_dot(const int n, const double* x, + const int incx, const double* y, const int incy) { + return cblas_ddot(n, x, incx, y, incy); +} + +template +void caffe_set(const int N, const Dtype alpha, Dtype* Y) { + if (alpha == 0) { + memset(Y, 0, sizeof(Dtype) * N); // NOLINT(caffe/alt_fn) + return; + } + for (int i = 0; i < N; ++i) { + Y[i] = alpha; + } +} + +template void caffe_set(const int N, const int alpha, int* Y); +template void caffe_set(const int N, const float alpha, float* Y); +template void caffe_set(const int N, const double alpha, double* Y); + +template <> +void caffe_log(const int n, const float* a, float* y) { + vsLn(n, a, y); +} + +template <> +void caffe_log(const int n, const double* a, double* y) { + vdLn(n, a, y); +} + +template +void caffe_copy(const int N, const Dtype* X, Dtype* Y) { + if (X != Y) { + memcpy(Y, X, sizeof(Dtype) * N); // NOLINT(caffe/alt_fn) + } +} + +template void caffe_copy(const int N, const int* X, int* Y); +template void caffe_copy(const int N, const unsigned int* X, + unsigned int* Y); +template void caffe_copy(const int N, const float* X, float* Y); +template void caffe_copy(const int N, const double* X, double* Y); + +template <> +void caffe_abs(const int n, const float* a, float* y) { + vsAbs(n, a, y); +} + +template <> +void caffe_abs(const int n, const double* a, double* y) { + vdAbs(n, a, y); +} + +template <> +void caffe_div(const int n, const float* a, const float* b, float* y) { vsDiv(n, a, b, y); } @@ -175,8 +262,7 @@ void caffe_div(const int n, const double* a, const double* b, } template <> -void caffe_powx(const int n, const float* a, const float b, - float* y) { +void caffe_powx(const int n, const float* a, const float b, float* y) { vsPowx(n, a, b, y); } @@ -206,84 +292,57 @@ void caffe_exp(const int n, const double* a, double* y) { vdExp(n, a, y); } -template <> -void caffe_log(const int n, const float* a, float* y) { - vsLn(n, a, y); -} - -template <> -void caffe_log(const int n, const double* a, double* y) { - vdLn(n, a, y); -} - -template <> -void caffe_abs(const int n, const float* a, float* y) { - vsAbs(n, a, y); -} - -template <> -void caffe_abs(const int n, const double* a, double* y) { - vdAbs(n, a, y); -} - unsigned int caffe_rng_rand() { return (*caffe_rng())(); } template Dtype caffe_nextafter(const Dtype b) { - return boost::math::nextafter( - b, std::numeric_limits::max()); + return boost::math::nextafter < Dtype + > (b, std::numeric_limits < Dtype > ::max()); } - -template -float caffe_nextafter(const float b); - -template -double caffe_nextafter(const double b); +template float caffe_nextafter(const float b); +template double caffe_nextafter(const double b); template void caffe_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r) { CHECK_GE(n, 0); CHECK(r); CHECK_LE(a, b); - boost::uniform_real random_distribution(a, caffe_nextafter(b)); - boost::variate_generator > - variate_generator(caffe_rng(), random_distribution); + 
boost::uniform_real < Dtype + > random_distribution(a, caffe_nextafter(b)); + boost::variate_generator > variate_generator( + caffe_rng(), random_distribution); for (int i = 0; i < n; ++i) { r[i] = variate_generator(); } -} -template -void caffe_rng_uniform(const int n, const float a, const float b, - float* r); + //LOG(INFO) << "caffe_rng_uniform"; +} -template -void caffe_rng_uniform(const int n, const double a, const double b, - double* r); +template void caffe_rng_uniform(const int n, const float a, const float b, + float* r); +template void caffe_rng_uniform(const int n, const double a, const double b, + double* r); template -void caffe_rng_gaussian(const int n, const Dtype a, - const Dtype sigma, Dtype* r) { +void caffe_rng_gaussian(const int n, const Dtype a, const Dtype sigma, + Dtype* r) { CHECK_GE(n, 0); CHECK(r); CHECK_GT(sigma, 0); - boost::normal_distribution random_distribution(a, sigma); - boost::variate_generator > - variate_generator(caffe_rng(), random_distribution); + boost::normal_distribution < Dtype > random_distribution(a, sigma); + boost::variate_generator > variate_generator( + caffe_rng(), random_distribution); for (int i = 0; i < n; ++i) { r[i] = variate_generator(); } } -template -void caffe_rng_gaussian(const int n, const float mu, - const float sigma, float* r); - -template -void caffe_rng_gaussian(const int n, const double mu, - const double sigma, double* r); +template void caffe_rng_gaussian(const int n, const float mu, const float sigma, + float* r); +template void caffe_rng_gaussian(const int n, const double mu, + const double sigma, double* r); template void caffe_rng_bernoulli(const int n, const Dtype p, int* r) { @@ -291,19 +350,16 @@ void caffe_rng_bernoulli(const int n, const Dtype p, int* r) { CHECK(r); CHECK_GE(p, 0); CHECK_LE(p, 1); - boost::bernoulli_distribution random_distribution(p); - boost::variate_generator > - variate_generator(caffe_rng(), random_distribution); + boost::bernoulli_distribution < Dtype > random_distribution(p); + boost::variate_generator > variate_generator( + caffe_rng(), random_distribution); for (int i = 0; i < n; ++i) { r[i] = variate_generator(); } } -template -void caffe_rng_bernoulli(const int n, const double p, int* r); - -template -void caffe_rng_bernoulli(const int n, const float p, int* r); +template void caffe_rng_bernoulli(const int n, const double p, int* r); +template void caffe_rng_bernoulli(const int n, const float p, int* r); template void caffe_rng_bernoulli(const int n, const Dtype p, unsigned int* r) { @@ -311,61 +367,45 @@ void caffe_rng_bernoulli(const int n, const Dtype p, unsigned int* r) { CHECK(r); CHECK_GE(p, 0); CHECK_LE(p, 1); - boost::bernoulli_distribution random_distribution(p); - boost::variate_generator > - variate_generator(caffe_rng(), random_distribution); + boost::bernoulli_distribution < Dtype > random_distribution(p); + boost::variate_generator > variate_generator( + caffe_rng(), random_distribution); for (int i = 0; i < n; ++i) { r[i] = static_cast(variate_generator()); } } -template -void caffe_rng_bernoulli(const int n, const double p, unsigned int* r); - -template -void caffe_rng_bernoulli(const int n, const float p, unsigned int* r); +template void caffe_rng_bernoulli(const int n, const double p, unsigned int* r); +template void caffe_rng_bernoulli(const int n, const float p, unsigned int* r); template <> -float caffe_cpu_strided_dot(const int n, const float* x, const int incx, - const float* y, const int incy) { - return cblas_sdot(n, x, incx, y, incy); +float 
caffe_cpu_dot(const int n, const float* x, const float* y) { + return cblas_sdot(n, x, 1, y, 1); } template <> -double caffe_cpu_strided_dot(const int n, const double* x, - const int incx, const double* y, const int incy) { - return cblas_ddot(n, x, incx, y, incy); -} - -template -Dtype caffe_cpu_dot(const int n, const Dtype* x, const Dtype* y) { - return caffe_cpu_strided_dot(n, x, 1, y, 1); +double caffe_cpu_dot(const int n, const double* x, const double* y) { + return cblas_ddot(n, x, 1, y, 1); } -template -float caffe_cpu_dot(const int n, const float* x, const float* y); - -template -double caffe_cpu_dot(const int n, const double* x, const double* y); - template <> int caffe_cpu_hamming_distance(const int n, const float* x, - const float* y) { + const float* y) { int dist = 0; for (int i = 0; i < n; ++i) { - dist += __builtin_popcount(static_cast(x[i]) ^ - static_cast(y[i])); + dist += __builtin_popcount( + static_cast(x[i]) ^ static_cast(y[i])); } return dist; } template <> int caffe_cpu_hamming_distance(const int n, const double* x, - const double* y) { + const double* y) { int dist = 0; for (int i = 0; i < n; ++i) { - dist += __builtin_popcountl(static_cast(x[i]) ^ - static_cast(y[i])); + dist += __builtin_popcountl( + static_cast(x[i]) ^ static_cast(y[i])); } return dist; } @@ -380,18 +420,637 @@ double caffe_cpu_asum(const int n, const double* x) { return cblas_dasum(n, x, 1); } +INSTANTIATE_CAFFE_CPU_UNARY_FUNC (sign); +INSTANTIATE_CAFFE_CPU_UNARY_FUNC (sgnbit); +INSTANTIATE_CAFFE_CPU_UNARY_FUNC (fabs); + template <> void caffe_cpu_scale(const int n, const float alpha, const float *x, - float* y) { + float* y) { cblas_scopy(n, x, 1, y, 1); cblas_sscal(n, alpha, y, 1); } template <> void caffe_cpu_scale(const int n, const double alpha, const double *x, - double* y) { + double* y) { cblas_dcopy(n, x, 1, y, 1); cblas_dscal(n, alpha, y, 1); } +#ifndef CPU_ONLY +//DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sign, y[index] = (Dtype(0) < x[index]) +// - (x[index] < Dtype(0))); +//DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sgnbit, y[index] = signbit(x[index])); + +template <> +void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C) { + clblasTranspose transA = + (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; + clblasTranspose transB = + (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans; + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + int ldc = N; + CLBLAS_CHECK( + clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha, + (cl_mem) B, 0, ldb, (cl_mem) A, 0, lda, (cl_float) beta, (cl_mem) C, + 0, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); +} + +template <> +void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C) { + clblasTranspose transA = + (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; + clblasTranspose transB = + (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans; + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? 
N : K; + int ldc = N; + CLBLAS_CHECK( + clblasDgemm(amdDevice.col, transB, transA, N, M, K, alpha, + (cl_mem) B, 0, ldb, (cl_mem) A, 0, lda, beta, (cl_mem) C, + 0, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); +} + +template <> +cl_event caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const float alpha, const float* A, const int offA, const float* B, + const int offB, const float beta, float* C, const int offC) { + cl_event event; + clblasTranspose transA = + (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; + clblasTranspose transB = + (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans; + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + int ldc = N; + CLBLAS_CHECK( + clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha, + (cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta, + (cl_mem) C, offC, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, + &event)); + return event; +} + +template <> +cl_event caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const double alpha, const double* A, const int offA, const double* B, + const int offB, const double beta, double* C, const int offC) { + cl_event event; + clblasTranspose transA = + (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; + clblasTranspose transB = + (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans; + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + int ldc = N; + CLBLAS_CHECK( + clblasDgemm(amdDevice.col, transB, transA, N, M, K, alpha, + (cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, beta, + (cl_mem) C, offC, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, + &event)); + return event; +} + +template <> +cl_event caffe_gpu_gemm(cl_command_queue *queue, + const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, + const int N, const int K, const float alpha, const float* A, const int offA, + const float* B, const int offB, const float beta, float* C, + const int offC) { + cl_event event; + clblasTranspose transA = + (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; + clblasTranspose transB = + (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans; + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + int ldc = N; + CLBLAS_CHECK( + clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float) alpha, + (cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, (cl_float) beta, + (cl_mem) C, offC, ldc, 1, queue, 0, NULL, &event)); + return event; +} + +template <> +cl_event caffe_gpu_gemm(cl_command_queue *queue, + const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, + const int N, const int K, const double alpha, const double* A, + const int offA, const double* B, const int offB, const double beta, + double* C, const int offC) { + cl_event event; + clblasTranspose transA = + (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; + clblasTranspose transB = + (TransB == CblasNoTrans) ? clblasNoTrans : clblasTrans; + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? 
N : K; + int ldc = N; + CLBLAS_CHECK( + clblasDgemm(amdDevice.col, transB, transA, N, M, K, alpha, + (cl_mem) B, offB, ldb, (cl_mem) A, offA, lda, beta, + (cl_mem) C, offC, ldc, 1, queue, 0, NULL, &event)); + return event; +} + +template <> +void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, + const int N, const float alpha, const float* A, size_t offA, int lda, + const float* x, size_t offx, const float beta, int incx, float* y, + size_t offy, int incy) { + clblasTranspose transA = + (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; + CLBLAS_CHECK( + clblasSgemv(amdDevice.row, transA, M, N, (cl_float) alpha, (cl_mem) A, + offA, lda, (cl_mem) x, offx, incx, (cl_float) beta, (cl_mem) y, offy, + incy, 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); +} + +template <> +void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, + const int N, const double alpha, const double* A, size_t offA, int lda, + const double* x, size_t offx, const double beta, int incx, double* y, + size_t offy, int incy) { + clblasTranspose transA = + (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; + CLBLAS_CHECK( + clblasDgemv(amdDevice.row, transA, M, N, (cl_double) alpha, (cl_mem) A, + offA, lda, (cl_mem) x, offx, incx, (cl_double) beta, (cl_mem) y, offy, + incy, 1, &(amdDevice.CommandQueue), 0, NULL, NULL)); +} + +template <> +void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, + const int N, const float alpha, const float* A, const float* x, + const float beta, float* y) { + clblasTranspose transA = + (TransA == CblasNoTrans) ? clblasNoTrans : clblasTrans; + CLBLAS_CHECK( + clblasSgemv(amdDevice.row, transA, M, N, (cl_float) alpha, (cl_mem) A, 0, + N, (cl_mem) x, 0, 1, (cl_float) beta, (cl_mem) y, 0, 1, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL)); +} + +template <> +void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, + const int N, const double alpha, const double* A, const double* x, + const double beta, double* y) { + clblasTranspose transA = + (TransA == CblasNoTrans) ? 
clblasNoTrans : clblasTrans; + CLBLAS_CHECK( + clblasDgemv(amdDevice.row, transA, M, N, (cl_double) alpha, (cl_mem) A, 0, + N, (cl_mem) x, 0, 1, (cl_double) beta, (cl_mem) y, 0, 1, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL)); +} + +template <> +void caffe_gpu_axpy(const int N, const float alpha, const float* X, + float* Y) { + CLBLAS_CHECK( + clblasSaxpy(N, alpha, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL)); +} + +template <> +void caffe_gpu_axpy(const int N, const double alpha, const double* X, + double* Y) { + CLBLAS_CHECK( + clblasDaxpy(N, alpha, (cl_mem) X, 0, 1, (cl_mem) Y, 0, 1, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL)); +} + +template <> +void caffe_gpu_sgnbit(const int n, const float* x, float* y) { + caffe_gpu_signbit(n, x, y); +} + +template <> +void caffe_gpu_sgnbit(const int n, const double* x, double* y) { + caffe_gpu_signbit(n, x, y); +} + +template <> +void caffe_gpu_abs(const int n, const float* x, float* y) { + caffe_gpu_abs_ocl(n, x, y); +} + +template <> +void caffe_gpu_abs(const int n, const double* x, double* y) { + caffe_gpu_abs_ocl(n, x, y); +} + +void caffe_gpu_memcpy(const size_t N, const void *X, void *Y) { + clEnqueueReadBuffer(amdDevice.CommandQueue, (cl_mem) X, CL_TRUE, 0, N, Y, 0, + NULL, NULL); +} +template <> +void caffe_gpu_memcpy(const size_t N, const float* X, float* Y) { + OCL_CHECK( + clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) X, (cl_mem) Y, 0, 0, + N, 0, NULL, NULL)); +} + +template <> +void caffe_gpu_memcpy(const size_t N, const double* X, double* Y) { + OCL_CHECK( + clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) X, (cl_mem) Y, 0, 0, + N, 0, NULL, NULL)); +} + +template +void caffe_gpu_copy(const int N, const Dtype* X, Dtype* Y) { + if (X != Y) { + OCL_CHECK( + clEnqueueCopyBuffer(amdDevice.CommandQueue, (cl_mem) X, (cl_mem) Y, 0, 0, + N * sizeof(Dtype), 0, NULL, NULL)); + } +} +template void caffe_gpu_copy(const int N, const float* X, float* Y); +template void caffe_gpu_copy(const int N, const double* X, double* Y); +template void caffe_gpu_copy(const int N, const int* X, int* Y); +template void caffe_gpu_copy(const int N, const unsigned int* X, unsigned int* Y); + +template <> +void caffe_gpu_copy(const int N, const float* X, const int offx, float* Y, const int offy) { + if (X != Y) { + CLBLAS_CHECK( + clblasScopy(N, (cl_mem) X, offx, 1, (cl_mem) Y, offy, 1, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL)); + } +} + +template <> +void caffe_gpu_copy(const int N, const double* X, const int offx, double* Y, const int offy) { + if (X != Y) { + CLBLAS_CHECK( + clblasDcopy(N, (cl_mem) X, offx, 1, (cl_mem) Y, offy, 1, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL)); + } +} + +template <> +void caffe_gpu_scal(const int N, const float alpha, float *X, const int offx) { + CLBLAS_CHECK( + clblasSscal(N, alpha, (cl_mem) X, offx, 1, 1, &(amdDevice.CommandQueue), 0, + NULL, NULL)); +} + +template <> +void caffe_gpu_scal(const int N, const double alpha, double *X, const int offx) { + CLBLAS_CHECK( + clblasDscal(N, alpha, (cl_mem) X, offx, 1, 1, &(amdDevice.CommandQueue), 0, + NULL, NULL)); +} + +template <> +void caffe_gpu_axpby(const int N, const float alpha, const float* X, + const float beta, float* Y) { + caffe_gpu_scal(N, beta, Y); + caffe_gpu_axpy(N, alpha, X, Y); +} + +template <> +void caffe_gpu_axpby(const int N, const double alpha, const double* X, + const double beta, double* Y) { + caffe_gpu_scal(N, beta, Y); + caffe_gpu_axpy(N, alpha, X, Y); +} + +template <> +void caffe_gpu_dot(const 
int n, const float* x, const float* y, + float* out) { + cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (n * sizeof(float)), NULL, NULL); + cl_mem d_out = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (1 * sizeof(float)), NULL, NULL); + clblasSdot(n, d_out, 0, (cl_mem) x, 0, 1, (cl_mem) y, 0, 1, scratchBuff, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL); + clEnqueueReadBuffer(amdDevice.CommandQueue, d_out, CL_TRUE, 0, sizeof(float), + out, 0, NULL, NULL); + clReleaseMemObject(scratchBuff); + clReleaseMemObject(d_out); +} + +template <> +void caffe_gpu_dot(const int n, const double* x, const double* y, + double * out) { + //need to pass in scratchBuff + cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (n * sizeof(double)), NULL, NULL); + cl_mem d_out = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (1 * sizeof(double)), NULL, NULL); + clblasDdot(n, d_out, 0, (cl_mem) x, 0, 1, (cl_mem) y, 0, 1, scratchBuff, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL); + clEnqueueReadBuffer(amdDevice.CommandQueue, d_out, CL_TRUE, 0, sizeof(double), + out, 0, NULL, NULL); + clReleaseMemObject(scratchBuff); + clReleaseMemObject(d_out); +} + +template <> +void caffe_gpu_dot(const int n, const float* x, size_t offx, const float* y, size_t offy, float* out) { + cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (n * sizeof(float)), NULL, NULL); + cl_mem d_out = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (1 * sizeof(float)), NULL, NULL); + clblasSdot(n, d_out, 0, (cl_mem) x, offx, 1, (cl_mem) y, offy, 1, scratchBuff, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL); + clEnqueueReadBuffer(amdDevice.CommandQueue, d_out, CL_TRUE, 0, sizeof(float), + out, 0, NULL, NULL); + clReleaseMemObject(scratchBuff); + clReleaseMemObject(d_out); +} + +template <> +void caffe_gpu_dot(const int n, const double* x, size_t offx, const double* y, size_t offy, double * out) { + //need to pass in scratchBuff + cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (n * sizeof(double)), NULL, NULL); + cl_mem d_out = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (1 * sizeof(double)), NULL, NULL); + clblasDdot(n, d_out, 0, (cl_mem) x, offx, 1, (cl_mem) y, offy, 1, scratchBuff, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL); + clEnqueueReadBuffer(amdDevice.CommandQueue, d_out, CL_TRUE, 0, sizeof(double), + out, 0, NULL, NULL); + clReleaseMemObject(scratchBuff); + clReleaseMemObject(d_out); +} + +template <> +void caffe_gpu_asum(const int n, const float* x, float* y) { + cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (n * sizeof(cl_float)), NULL, NULL); + cl_mem d_y = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (1 * sizeof(cl_float)), NULL, NULL); + clblasSasum(n, d_y, 0, (cl_mem) x, 0, 1, scratchBuff, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL); + clEnqueueReadBuffer(amdDevice.CommandQueue, d_y, CL_TRUE, 0, sizeof(float), y, + 0, NULL, NULL); + clReleaseMemObject(scratchBuff); + clReleaseMemObject(d_y); +} + +template <> +void caffe_gpu_asum(const int n, const double* x, double* y) { + cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (n * sizeof(cl_double)), NULL, NULL); + cl_mem d_y = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (1 * sizeof(cl_double)), NULL, NULL); + clblasDasum(n, d_y, 0, (cl_mem) x, 0, 1, scratchBuff, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL); + clEnqueueReadBuffer(amdDevice.CommandQueue, d_y, 
CL_TRUE, 0, sizeof(double), + y, 0, NULL, NULL); + clReleaseMemObject(scratchBuff); + clReleaseMemObject(d_y); +} + +template <> +void caffe_gpu_asum(const int n, const float* x, size_t offx, float* y) { + cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (n * sizeof(cl_float)), NULL, NULL); + cl_mem d_y = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (1 * sizeof(cl_float)), NULL, NULL); + clblasSasum(n, d_y, 0, (cl_mem) x, offx, 1, scratchBuff, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL); + clEnqueueReadBuffer(amdDevice.CommandQueue, d_y, CL_TRUE, 0, sizeof(float), y, + 0, NULL, NULL); + clReleaseMemObject(scratchBuff); + clReleaseMemObject(d_y); +} + +template <> +void caffe_gpu_asum(const int n, const double* x, size_t offx, double* y) { + cl_mem scratchBuff = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (n * sizeof(cl_double)), NULL, NULL); + cl_mem d_y = clCreateBuffer(amdDevice.Context, CL_MEM_READ_WRITE, + (1 * sizeof(cl_double)), NULL, NULL); + clblasDasum(n, d_y, 0, (cl_mem) x, offx, 1, scratchBuff, 1, + &(amdDevice.CommandQueue), 0, NULL, NULL); + clEnqueueReadBuffer(amdDevice.CommandQueue, d_y, CL_TRUE, 0, sizeof(double), + y, 0, NULL, NULL); + clReleaseMemObject(scratchBuff); + clReleaseMemObject(d_y); +} + + +template <> +void caffe_gpu_scale(const int n, const float alpha, const float *x, + float* y) { + caffe_gpu_copy(n, x, y); + caffe_gpu_scal(n, alpha, y); +} + +template <> +void caffe_gpu_scale(const int n, const double alpha, const double *x, + double* y) { + caffe_gpu_copy(n, x, y); + caffe_gpu_scal(n, alpha, y); +} + +template <> +void caffe_gpu_scale(const int n, const float alpha, const float *x, + const int offx, float* y, const int offy) { + caffe_gpu_copy(n, x, offx, y, offy); + caffe_gpu_scal(n, alpha, y, offy); +} + +template <> +void caffe_gpu_scale(const int n, const double alpha, const double *x, + const int offx, double* y, const int offy) { + caffe_gpu_copy(n, x, offx, y, offy); + caffe_gpu_scal(n, alpha, y, offy); +} + +template +void set_kernel(const int n, const Dtype alpha, Dtype* y) { + NOT_IMPLEMENTED; +} + +template <> +void caffe_gpu_set(const int N, const float alpha, float* Y, const int offy) { + ocl_memset(Y, alpha, N, offy); +} + +template <> +void caffe_gpu_set(const int N, const double alpha, double* Y, const int offy) { + ocl_memset(Y, alpha, N, offy); +} + +template <> +void caffe_gpu_add_scalar(const int N, const float alpha, float* Y) { + kernel_add_scalar(N, alpha, Y); +} + +template <> +void caffe_gpu_add_scalar(const int N, const double alpha, double* Y) { + kernel_add_scalar(N, alpha, Y); +} + +template <> +void caffe_gpu_exp(const int N, const float* a, float* y) { + kernel_exp(N, a, y); +} + +template <> +void caffe_gpu_exp(const int N, const double* a, double* y) { + kernel_exp(N, a, y); +} + +template <> +void caffe_gpu_sign(const int N, const float *X, float *Y) { + caffe_gpu_sign_ocl(N, X, Y); +} + + +template <> +void caffe_gpu_sign(const int N, const double *X, double *Y) { + caffe_gpu_sign_ocl(N, X, Y); +} + +template <> +void caffe_gpu_sign(const int N, const float *X, const int offx, float *Y, const int offy) { + caffe_gpu_sign_with_offset_ocl(N, X, offx, Y, offy); +} + + +template <> +void caffe_gpu_sign(const int N, const double *X, const int offx, double *Y, const int offy) { + caffe_gpu_sign_with_offset_ocl(N, X, offx, Y, offy); +} + +template <> +void caffe_gpu_sub(const int N, const float* a, const float* b, + float* y) { + // NOLINT_NEXT_LINE(whitespace/operators) + 
kernel_sub(N, a, b, y); +} + +template <> +void caffe_gpu_sub(const int N, const double* a, const double* b, + double* y) { + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_sub(N, a, b, y); +} + +template <> +void caffe_gpu_mul(const int N, const float* a, const float* b, + float* y) { + kernel_mul(N, a, b, y); +} + +template <> +void caffe_gpu_mul(const int N, const double* a, const double* b, + double* y) { + kernel_mul(N, a, b, y); +} + +template <> +void caffe_gpu_div(const int N, const float* a, const float* b, + float* y) { + kernel_div(N, a, b, y); +} + +template <> +void caffe_gpu_div(const int N, const double* a, const double* b, + double* y) { + kernel_div(N, a, b, y); +} + +template <> +void caffe_gpu_powx(const int N, const float* a, const float alpha, + float* y) { + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_powx(N, a, alpha, y); +} + +template <> +void caffe_gpu_powx(const int N, const double* a, const double alpha, + double* y) { + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_powx(N, a, alpha, y); +} + +void popc_kernel(const int n, const float* a, const float* b, uint8_t* y) { + NOT_IMPLEMENTED; +} + +void popcll_kernel(const int n, const double* a, const double* b, uint8_t* y) { + NOT_IMPLEMENTED; +} + +template <> +uint32_t caffe_gpu_hamming_distance(const int n, const float* x, + const float* y) { + NOT_IMPLEMENTED; +} + +template <> +uint32_t caffe_gpu_hamming_distance(const int n, const double* x, + const double* y) { + NOT_IMPLEMENTED; +} + +void caffe_gpu_rng_uniform(const int n, unsigned int* r) { + caffe_gpu_uniform(n, r); +} + +template <> +void caffe_gpu_rng_uniform(const int n, const float a, const float b, + float* r) { + caffe_gpu_uniform(r, n, a, b); // r is a cl_mem object +} +template <> +void caffe_gpu_rng_uniform(const int n, const double a, const double b, + double* r) { + caffe_gpu_uniform(r, n, a, b); // r is a cl_mem object +} + +template <> +void caffe_gpu_rng_gaussian(const int n, const float mu, + const float sigma, float* r) { + caffe_gpu_gaussian(r, n, mu, sigma); // r is a cl_mem object +} + +template <> +void caffe_gpu_rng_gaussian(const int n, const double mu, + const double sigma, double* r) { + caffe_gpu_gaussian(r, n, mu, sigma); // r is a cl_mem object +} + +template <> +void caffe_gpu_log(const int N, const float* a, float* y) { + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_log(N, a, y); +} + +template <> +void caffe_gpu_log(const int N, const double* a, double* y) { + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_log(N, a, y); +} + +template <> +void caffe_gpu_add(const int N, const float* a, const float* b, + float* y) { + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_add(N, a, b, y); +} + +template <> +void caffe_gpu_add(const int N, const double* a, const double* b, + double* y) { + // NOLINT_NEXT_LINE(whitespace/operators) + kernel_add(N, a, b, y); +} +#endif } // namespace caffe diff --git a/src/caffe/util/math_functions.cpp.protect b/src/caffe/util/math_functions.cpp.protect new file mode 100644 index 00000000..166b709a --- /dev/null +++ b/src/caffe/util/math_functions.cpp.protect @@ -0,0 +1,413 @@ +#include +#include + +#include +#include + +#include "caffe/common.hpp" +#include "caffe/util/math_functions.hpp" +#include "caffe/util/rng.hpp" + + +namespace caffe { + +template<> +void caffe_cpu_gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C) { + int lda = 
(TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, + ldb, beta, C, N); +} + +template<> +void caffe_cpu_gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C) { + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + cblas_dgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, + ldb, beta, C, N); +} + +template <> +void caffe_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, + const int N, const float alpha, const float* A, const float* x, + const float beta, float* y) { + cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1); +} + +template <> +void caffe_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, + const int N, const double alpha, const double* A, const double* x, + const double beta, double* y) { + cblas_dgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1); +} + +template <> +void caffe_axpy(const int N, const float alpha, const float* X, + float* Y) { cblas_saxpy(N, alpha, X, 1, Y, 1); } + +template <> +void caffe_axpy(const int N, const double alpha, const double* X, + double* Y) { cblas_daxpy(N, alpha, X, 1, Y, 1); } + +template +void caffe_set(const int N, const Dtype alpha, Dtype* Y) { + if (alpha == 0) { + memset(Y, 0, sizeof(Dtype) * N); // NOLINT(caffe/alt_fn) + return; + } + for (int i = 0; i < N; ++i) { + Y[i] = alpha; + } +} + +template void caffe_set(const int N, const int alpha, int* Y); +template void caffe_set(const int N, const float alpha, float* Y); +template void caffe_set(const int N, const double alpha, double* Y); + +template <> +void caffe_add_scalar(const int N, const float alpha, float* Y) { + for (int i = 0; i < N; ++i) { + Y[i] += alpha; + } +} + +template <> +void caffe_add_scalar(const int N, const double alpha, double* Y) { + for (int i = 0; i < N; ++i) { + Y[i] += alpha; + } +} + +template +void caffe_copy(const int N, const Dtype* X, Dtype* Y) { + if (X != Y) { + if (Caffe::mode() == Caffe::GPU) { +#ifndef CPU_ONLY + // NOLINT_NEXT_LINE(caffe/alt_fn) + CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyDefault)); +#else + NO_GPU; +#endif + } else { + memcpy(Y, X, sizeof(Dtype) * N); // NOLINT(caffe/alt_fn) + } + } +} + +template void caffe_copy(const int N, const int* X, int* Y); +template void caffe_copy(const int N, const unsigned int* X, + unsigned int* Y); +template void caffe_copy(const int N, const float* X, float* Y); +template void caffe_copy(const int N, const double* X, double* Y); + +template <> +void caffe_scal(const int N, const float alpha, float *X) { + cblas_sscal(N, alpha, X, 1); +} + +template <> +void caffe_scal(const int N, const double alpha, double *X) { + cblas_dscal(N, alpha, X, 1); +} + +template <> +void caffe_cpu_axpby(const int N, const float alpha, const float* X, + const float beta, float* Y) { + cblas_saxpby(N, alpha, X, 1, beta, Y, 1); +} + +template <> +void caffe_cpu_axpby(const int N, const double alpha, const double* X, + const double beta, double* Y) { + cblas_daxpby(N, alpha, X, 1, beta, Y, 1); +} + +template <> +void caffe_add(const int n, const float* a, const float* b, + float* y) { + vsAdd(n, a, b, y); +} + +template <> +void caffe_add(const int n, const double* a, const double* b, + double* y) { + vdAdd(n, a, b, y); +} + +template <> +void caffe_sub(const int n, const float* a, const 
float* b, + float* y) { + vsSub(n, a, b, y); +} + +template <> +void caffe_sub(const int n, const double* a, const double* b, + double* y) { + vdSub(n, a, b, y); +} + +template <> +void caffe_mul(const int n, const float* a, const float* b, + float* y) { + vsMul(n, a, b, y); +} + +template <> +void caffe_mul(const int n, const double* a, const double* b, + double* y) { + vdMul(n, a, b, y); +} + +template <> +void caffe_div(const int n, const float* a, const float* b, + float* y) { + vsDiv(n, a, b, y); +} + +template <> +void caffe_div(const int n, const double* a, const double* b, + double* y) { + vdDiv(n, a, b, y); +} + +template <> +void caffe_powx(const int n, const float* a, const float b, + float* y) { + vsPowx(n, a, b, y); +} + +template <> +void caffe_powx(const int n, const double* a, const double b, + double* y) { + vdPowx(n, a, b, y); +} + +template <> +void caffe_sqr(const int n, const float* a, float* y) { + vsSqr(n, a, y); +} + +template <> +void caffe_sqr(const int n, const double* a, double* y) { + vdSqr(n, a, y); +} + +template <> +void caffe_exp(const int n, const float* a, float* y) { + vsExp(n, a, y); +} + +template <> +void caffe_exp(const int n, const double* a, double* y) { + vdExp(n, a, y); +} + +template <> +void caffe_log(const int n, const float* a, float* y) { + vsLn(n, a, y); +} + +template <> +void caffe_log(const int n, const double* a, double* y) { + vdLn(n, a, y); +} + +template <> +void caffe_abs(const int n, const float* a, float* y) { + vsAbs(n, a, y); +} + +template <> +void caffe_abs(const int n, const double* a, double* y) { + vdAbs(n, a, y); +} + +unsigned int caffe_rng_rand() { + return (*caffe_rng())(); +} + +template +Dtype caffe_nextafter(const Dtype b) { + return boost::math::nextafter( + b, std::numeric_limits::max()); +} + +template +float caffe_nextafter(const float b); + +template +double caffe_nextafter(const double b); + +template +void caffe_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r) { + CHECK_GE(n, 0); + CHECK(r); + CHECK_LE(a, b); + boost::uniform_real random_distribution(a, caffe_nextafter(b)); + boost::variate_generator > + variate_generator(caffe_rng(), random_distribution); + for (int i = 0; i < n; ++i) { + r[i] = variate_generator(); + } +} + +template +void caffe_rng_uniform(const int n, const float a, const float b, + float* r); + +template +void caffe_rng_uniform(const int n, const double a, const double b, + double* r); + +template +void caffe_rng_gaussian(const int n, const Dtype a, + const Dtype sigma, Dtype* r) { + CHECK_GE(n, 0); + CHECK(r); + CHECK_GT(sigma, 0); + boost::normal_distribution random_distribution(a, sigma); + boost::variate_generator > + variate_generator(caffe_rng(), random_distribution); + for (int i = 0; i < n; ++i) { + r[i] = variate_generator(); + } +} + +template +void caffe_rng_gaussian(const int n, const float mu, + const float sigma, float* r); + +template +void caffe_rng_gaussian(const int n, const double mu, + const double sigma, double* r); + +template +void caffe_rng_bernoulli(const int n, const Dtype p, int* r) { + CHECK_GE(n, 0); + CHECK(r); + CHECK_GE(p, 0); + CHECK_LE(p, 1); + boost::bernoulli_distribution random_distribution(p); + boost::variate_generator > + variate_generator(caffe_rng(), random_distribution); + for (int i = 0; i < n; ++i) { + r[i] = variate_generator(); + } +} + +template +void caffe_rng_bernoulli(const int n, const double p, int* r); + +template +void caffe_rng_bernoulli(const int n, const float p, int* r); + +template +void caffe_rng_bernoulli(const 
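
The RNG helpers above wrap a boost distribution in a boost::variate_generator driven by caffe_rng(). Spelled out with its template arguments, the uniform case looks roughly like the self-contained sketch below; boost::mt19937 stands in for Caffe's generator type, which is an assumption made here for illustration only.

    #include <boost/random/mersenne_twister.hpp>
    #include <boost/random/uniform_real.hpp>
    #include <boost/random/variate_generator.hpp>
    #include <boost/math/special_functions/next.hpp>
    #include <limits>

    template <typename Dtype>
    void rng_uniform_sketch(const int n, const Dtype a, const Dtype b, Dtype* r) {
      static boost::mt19937 rng(1701);  // stand-in for *caffe_rng()
      // nextafter(b, max) nudges the upper bound so that b itself can be drawn.
      const Dtype upper =
          boost::math::nextafter(b, std::numeric_limits<Dtype>::max());
      boost::uniform_real<Dtype> dist(a, upper);
      boost::variate_generator<boost::mt19937&, boost::uniform_real<Dtype> >
          gen(rng, dist);
      for (int i = 0; i < n; ++i) {
        r[i] = gen();
      }
    }
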
int n, const Dtype p, unsigned int* r) { + CHECK_GE(n, 0); + CHECK(r); + CHECK_GE(p, 0); + CHECK_LE(p, 1); + boost::bernoulli_distribution random_distribution(p); + boost::variate_generator > + variate_generator(caffe_rng(), random_distribution); + for (int i = 0; i < n; ++i) { + r[i] = static_cast(variate_generator()); + } +} + +template +void caffe_rng_bernoulli(const int n, const double p, unsigned int* r); + +template +void caffe_rng_bernoulli(const int n, const float p, unsigned int* r); + +template <> +float caffe_cpu_strided_dot(const int n, const float* x, const int incx, + const float* y, const int incy) { + return cblas_sdot(n, x, incx, y, incy); +} + +template <> +double caffe_cpu_strided_dot(const int n, const double* x, + const int incx, const double* y, const int incy) { + return cblas_ddot(n, x, incx, y, incy); +} + +template +Dtype caffe_cpu_dot(const int n, const Dtype* x, const Dtype* y) { + return caffe_cpu_strided_dot(n, x, 1, y, 1); +} + +template +float caffe_cpu_dot(const int n, const float* x, const float* y); + +template +double caffe_cpu_dot(const int n, const double* x, const double* y); + +template <> +int caffe_cpu_hamming_distance(const int n, const float* x, + const float* y) { + int dist = 0; + for (int i = 0; i < n; ++i) { + dist += __builtin_popcount(static_cast(x[i]) ^ + static_cast(y[i])); + } + return dist; +} + +template <> +int caffe_cpu_hamming_distance(const int n, const double* x, + const double* y) { + int dist = 0; + for (int i = 0; i < n; ++i) { + dist += __builtin_popcountl(static_cast(x[i]) ^ + static_cast(y[i])); + } + return dist; +} + +template <> +float caffe_cpu_asum(const int n, const float* x) { + return cblas_sasum(n, x, 1); +} + +template <> +double caffe_cpu_asum(const int n, const double* x) { + return cblas_dasum(n, x, 1); +} + +template <> +void caffe_cpu_scale(const int n, const float alpha, const float *x, + float* y) { + cblas_scopy(n, x, 1, y, 1); + cblas_sscal(n, alpha, y, 1); +} + +template <> +void caffe_cpu_scale(const int n, const double alpha, const double *x, + double* y) { + cblas_dcopy(n, x, 1, y, 1); + cblas_dscal(n, alpha, y, 1); +} + +template <> +void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C) { + clblasTranspose transA = (TransA == CblasNoTrans)? clblasNoTrans : clblasTrans; + clblasTranspose transB = (TransB == CblasNoTrans)? clblasNoTrans : clblasTrans; + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + int ldc = N; + //AMDBLAS_CHECK( clAmdBlasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, ldb, (cl_mem)A, lda, (cl_float)beta, (cl_mem)C, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) ); + CLBLAS_CHECK( clblasSgemm(amdDevice.col, transB, transA, N, M, K, (cl_float)alpha, (cl_mem)B, 0, ldb, (cl_mem)A, 0, lda, (cl_float)beta, (cl_mem)C, 0, ldc, 1, &(amdDevice.CommandQueue), 0, NULL, NULL) ); +} + +} // namespace caffe diff --git a/src/caffe/util/math_functions.cu b/src/caffe/util/math_functions.cu index 2631a074..ae71de0f 100644 --- a/src/caffe/util/math_functions.cu +++ b/src/caffe/util/math_functions.cu @@ -24,8 +24,9 @@ void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; cublasOperation_t cuTransB = (TransB == CblasNoTrans) ? 
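
A note on the clblasSgemm call above: it passes a column-major order (amdDevice.col, presumably clblasColumnMajor), hands the operands over in the order B, A, and swaps M and N. That works because a row-major buffer reinterpreted as column-major data is the transpose of the same matrix, so requesting B^T * A^T = (A * B)^T in column-major order writes the row-major product A * B straight into C. A tiny host-side check of that reinterpretation, plain C++ with no OpenCL required:

    #include <cassert>

    int main() {
      // Row-major 2x3 matrix R = [[1, 2, 3], [4, 5, 6]].
      float R[6] = {1, 2, 3, 4, 5, 6};
      // Read as a column-major matrix with 3 rows, the same buffer is R^T (3x2):
      // element (i, j) of that view lives at R[i + 3 * j].
      assert(R[1 + 3 * 0] == 2.0f);  // R^T(1, 0) == R(0, 1)
      assert(R[0 + 3 * 1] == 4.0f);  // R^T(0, 1) == R(1, 0)
      return 0;
    }
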
CUBLAS_OP_N : CUBLAS_OP_T; - CUBLAS_CHECK(cublasSgemm(Caffe::cublas_handle(), cuTransB, cuTransA, - N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); + CUBLAS_CHECK( + cublasSgemm(Caffe::cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, + B, ldb, A, lda, &beta, C, N)); } template <> @@ -40,8 +41,9 @@ void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; cublasOperation_t cuTransB = (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - CUBLAS_CHECK(cublasDgemm(Caffe::cublas_handle(), cuTransB, cuTransA, - N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); + CUBLAS_CHECK( + cublasDgemm(Caffe::cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, + B, ldb, A, lda, &beta, C, N)); } template <> @@ -50,8 +52,9 @@ void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const float beta, float* y) { cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N; - CUBLAS_CHECK(cublasSgemv(Caffe::cublas_handle(), cuTransA, N, M, &alpha, - A, N, x, 1, &beta, y, 1)); + CUBLAS_CHECK( + cublasSgemv(Caffe::cublas_handle(), cuTransA, N, M, &alpha, A, N, x, 1, + &beta, y, 1)); } template <> @@ -60,8 +63,9 @@ void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const double beta, double* y) { cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N; - CUBLAS_CHECK(cublasDgemv(Caffe::cublas_handle(), cuTransA, N, M, &alpha, - A, N, x, 1, &beta, y, 1)); + CUBLAS_CHECK( + cublasDgemv(Caffe::cublas_handle(), cuTransA, N, M, &alpha, A, N, x, 1, + &beta, y, 1)); } template <> @@ -130,14 +134,14 @@ void caffe_gpu_asum(const int n, const double* x, double* y) { template <> void caffe_gpu_scale(const int n, const float alpha, const float *x, - float* y) { + float* y) { CUBLAS_CHECK(cublasScopy(Caffe::cublas_handle(), n, x, 1, y, 1)); CUBLAS_CHECK(cublasSscal(Caffe::cublas_handle(), n, &alpha, y, 1)); } template <> void caffe_gpu_scale(const int n, const double alpha, const double *x, - double* y) { + double* y) { CUBLAS_CHECK(cublasDcopy(Caffe::cublas_handle(), n, x, 1, y, 1)); CUBLAS_CHECK(cublasDscal(Caffe::cublas_handle(), n, &alpha, y, 1)); } @@ -156,8 +160,8 @@ void caffe_gpu_set(const int N, const Dtype alpha, Dtype* Y) { return; } // NOLINT_NEXT_LINE(whitespace/operators) - set_kernel<<>>( - N, alpha, Y); +set_kernel<<>>( + N, alpha, Y); } template void caffe_gpu_set(const int N, const int alpha, int* Y); @@ -166,300 +170,300 @@ template void caffe_gpu_set(const int N, const double alpha, double* Y); template __global__ void add_scalar_kernel(const int n, const Dtype alpha, Dtype* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] += alpha; - } +CUDA_KERNEL_LOOP(index, n) { + y[index] += alpha; +} } template <> void caffe_gpu_add_scalar(const int N, const float alpha, float* Y) { - // NOLINT_NEXT_LINE(whitespace/operators) - add_scalar_kernel<<>>( - N, alpha, Y); +// NOLINT_NEXT_LINE(whitespace/operators) +add_scalar_kernel<<>>( + N, alpha, Y); } template <> void caffe_gpu_add_scalar(const int N, const double alpha, double* Y) { // NOLINT_NEXT_LINE(whitespace/operators) - add_scalar_kernel<<>>( - N, alpha, Y); +add_scalar_kernel<<>>( +N, alpha, Y); } template __global__ void add_kernel(const int n, const Dtype* a, - const Dtype* b, Dtype* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] = a[index] + b[index]; - } +const Dtype* b, Dtype* y) { +CUDA_KERNEL_LOOP(index, n) { +y[index] = a[index] + b[index]; +} } template <> void caffe_gpu_add(const int N, const float* a, const float* b, - float* y) { +float* 
y) { // NOLINT_NEXT_LINE(whitespace/operators) - add_kernel<<>>( - N, a, b, y); +add_kernel<<>>( +N, a, b, y); } template <> void caffe_gpu_add(const int N, const double* a, const double* b, - double* y) { +double* y) { // NOLINT_NEXT_LINE(whitespace/operators) - add_kernel<<>>( - N, a, b, y); +add_kernel<<>>( +N, a, b, y); } template __global__ void sub_kernel(const int n, const Dtype* a, - const Dtype* b, Dtype* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] = a[index] - b[index]; - } +const Dtype* b, Dtype* y) { +CUDA_KERNEL_LOOP(index, n) { +y[index] = a[index] - b[index]; +} } template <> void caffe_gpu_sub(const int N, const float* a, const float* b, - float* y) { +float* y) { // NOLINT_NEXT_LINE(whitespace/operators) - sub_kernel<<>>( - N, a, b, y); +sub_kernel<<>>( +N, a, b, y); } template <> void caffe_gpu_sub(const int N, const double* a, const double* b, - double* y) { +double* y) { // NOLINT_NEXT_LINE(whitespace/operators) - sub_kernel<<>>( - N, a, b, y); +sub_kernel<<>>( +N, a, b, y); } template __global__ void mul_kernel(const int n, const Dtype* a, - const Dtype* b, Dtype* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] = a[index] * b[index]; - } +const Dtype* b, Dtype* y) { +CUDA_KERNEL_LOOP(index, n) { +y[index] = a[index] * b[index]; +} } template <> -void caffe_gpu_mul(const int N, const float* a, - const float* b, float* y) { +void caffe_gpu_mul(const int N, const float* a, const float* b, +float* y) { // NOLINT_NEXT_LINE(whitespace/operators) - mul_kernel<<>>( - N, a, b, y); +mul_kernel<<>>( +N, a, b, y); } template <> -void caffe_gpu_mul(const int N, const double* a, - const double* b, double* y) { +void caffe_gpu_mul(const int N, const double* a, const double* b, +double* y) { // NOLINT_NEXT_LINE(whitespace/operators) - mul_kernel<<>>( - N, a, b, y); +mul_kernel<<>>( +N, a, b, y); } template __global__ void div_kernel(const int n, const Dtype* a, - const Dtype* b, Dtype* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] = a[index] / b[index]; - } +const Dtype* b, Dtype* y) { +CUDA_KERNEL_LOOP(index, n) { +y[index] = a[index] / b[index]; +} } template <> -void caffe_gpu_div(const int N, const float* a, - const float* b, float* y) { +void caffe_gpu_div(const int N, const float* a, const float* b, +float* y) { // NOLINT_NEXT_LINE(whitespace/operators) - div_kernel<<>>( - N, a, b, y); +div_kernel<<>>( +N, a, b, y); } template <> -void caffe_gpu_div(const int N, const double* a, - const double* b, double* y) { +void caffe_gpu_div(const int N, const double* a, const double* b, +double* y) { // NOLINT_NEXT_LINE(whitespace/operators) - div_kernel<<>>( - N, a, b, y); +div_kernel<<>>( +N, a, b, y); } template __global__ void abs_kernel(const int n, const Dtype* a, Dtype* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] = abs(a[index]); - } +CUDA_KERNEL_LOOP(index, n) { +y[index] = abs(a[index]); +} } template <> void caffe_gpu_abs(const int N, const float* a, float* y) { // NOLINT_NEXT_LINE(whitespace/operators) - abs_kernel<<>>( - N, a, y); +abs_kernel<<>>( +N, a, y); } template <> void caffe_gpu_abs(const int N, const double* a, double* y) { // NOLINT_NEXT_LINE(whitespace/operators) - abs_kernel<<>>( - N, a, y); +abs_kernel<<>>( +N, a, y); } - template __global__ void exp_kernel(const int n, const Dtype* a, Dtype* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] = exp(a[index]); - } +CUDA_KERNEL_LOOP(index, n) { +y[index] = exp(a[index]); +} } template <> void caffe_gpu_exp(const int N, const float* a, float* y) { // NOLINT_NEXT_LINE(whitespace/operators) - exp_kernel<<>>( - N, a, 
y); +exp_kernel<<>>( +N, a, y); } template <> void caffe_gpu_exp(const int N, const double* a, double* y) { // NOLINT_NEXT_LINE(whitespace/operators) - exp_kernel<<>>( - N, a, y); +exp_kernel<<>>( +N, a, y); } template __global__ void log_kernel(const int n, const Dtype* a, Dtype* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] = log(a[index]); - } +CUDA_KERNEL_LOOP(index, n) { +y[index] = log(a[index]); +} } template <> void caffe_gpu_log(const int N, const float* a, float* y) { // NOLINT_NEXT_LINE(whitespace/operators) - log_kernel<<>>( - N, a, y); +log_kernel<<>>( +N, a, y); } template <> void caffe_gpu_log(const int N, const double* a, double* y) { // NOLINT_NEXT_LINE(whitespace/operators) - log_kernel<<>>( - N, a, y); +log_kernel<<>>( +N, a, y); } template __global__ void powx_kernel(const int n, const Dtype* a, - const Dtype alpha, Dtype* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] = pow(a[index], alpha); - } +const Dtype alpha, Dtype* y) { +CUDA_KERNEL_LOOP(index, n) { +y[index] = pow(a[index], alpha); +} } template <> -void caffe_gpu_powx(const int N, const float* a, - const float alpha, float* y) { +void caffe_gpu_powx(const int N, const float* a, const float alpha, +float* y) { // NOLINT_NEXT_LINE(whitespace/operators) - powx_kernel<<>>( - N, a, alpha, y); +powx_kernel<<>>( +N, a, alpha, y); } template <> -void caffe_gpu_powx(const int N, const double* a, - const double alpha, double* y) { +void caffe_gpu_powx(const int N, const double* a, const double alpha, +double* y) { // NOLINT_NEXT_LINE(whitespace/operators) - powx_kernel<<>>( - N, a, alpha, y); +powx_kernel<<>>( +N, a, alpha, y); } DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sign, y[index] = (Dtype(0) < x[index]) - - (x[index] < Dtype(0))); +- (x[index] < Dtype(0))); DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sgnbit, y[index] = signbit(x[index])); -__global__ void popc_kernel(const int n, const float* a, - const float* b, uint8_t* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] = __popc(static_cast(a[index]) ^ - static_cast(b[index])); - } +__global__ void popc_kernel(const int n, const float* a, const float* b, +uint8_t* y) { +CUDA_KERNEL_LOOP(index, n) +{ +y[index] = __popc( +static_cast(a[index]) ^ static_cast(b[index])); +} } -__global__ void popcll_kernel(const int n, const double* a, - const double* b, uint8_t* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] = __popcll(static_cast(a[index]) ^ - static_cast(b[index])); - } +__global__ void popcll_kernel(const int n, const double* a, const double* b, +uint8_t* y) { +CUDA_KERNEL_LOOP(index, n) +{ +y[index] = __popcll( +static_cast(a[index]) ^ static_cast(b[index])); +} } template <> uint32_t caffe_gpu_hamming_distance(const int n, const float* x, - const float* y) { +const float* y) { // TODO: Fix caffe_gpu_hamming_distance (see failing unit test // TestHammingDistanceGPU in test_math_functions.cpp). 
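
The sign functor instantiated above through DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC relies on the branchless idiom (0 < x) - (x < 0), which maps x to -1, 0 or +1 without a data-dependent branch. CPU equivalent for reference:

    template <typename Dtype>
    inline int sign_of(const Dtype x) {
      // (0 < x) and (x < 0) evaluate to 0 or 1, so the difference is -1, 0 or +1.
      return (Dtype(0) < x) - (x < Dtype(0));
    }
    // sign_of(-2.5) == -1, sign_of(0.0) == 0, sign_of(7) == 1
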
- NOT_IMPLEMENTED; - thrust::device_vector popcounts(n); +NOT_IMPLEMENTED; +thrust::device_vector < uint8_t > popcounts(n); // NOLINT_NEXT_LINE(whitespace/operators) - popc_kernel<<>>( - n, x, y, thrust::raw_pointer_cast(popcounts.data())); - return thrust::reduce(popcounts.begin(), popcounts.end(), - (uint32_t) 0, thrust::plus()); +popc_kernel<<>>( +n, x, y, thrust::raw_pointer_cast(popcounts.data())); +return thrust::reduce(popcounts.begin(), popcounts.end(), (uint32_t) 0, +thrust::plus()); } template <> uint32_t caffe_gpu_hamming_distance(const int n, const double* x, - const double* y) { +const double* y) { // TODO: Fix caffe_gpu_hamming_distance (see failing unit test // TestHammingDistanceGPU in test_math_functions.cpp). - NOT_IMPLEMENTED; - thrust::device_vector popcounts(n); +NOT_IMPLEMENTED; +thrust::device_vector < uint8_t > popcounts(n); // NOLINT_NEXT_LINE(whitespace/operators) - popcll_kernel<<>>( - n, x, y, thrust::raw_pointer_cast(popcounts.data())); - return thrust::reduce(popcounts.begin(), popcounts.end(), - /* NOLINT_NEXT_LINE(build/include_what_you_use) */ - (uint32_t) 0, thrust::plus()); +popcll_kernel<<>>( +n, x, y, thrust::raw_pointer_cast(popcounts.data())); +return thrust::reduce(popcounts.begin(), popcounts.end(), +/* NOLINT_NEXT_LINE(build/include_what_you_use) */ +(uint32_t) 0, thrust::plus()); } void caffe_gpu_rng_uniform(const int n, unsigned int* r) { - CURAND_CHECK(curandGenerate(Caffe::curand_generator(), r, n)); +CURAND_CHECK(curandGenerate(Caffe::curand_generator(), r, n)); } template <> void caffe_gpu_rng_uniform(const int n, const float a, const float b, - float* r) { - CURAND_CHECK(curandGenerateUniform(Caffe::curand_generator(), r, n)); - const float range = b - a; - if (range != static_cast(1)) { - caffe_gpu_scal(n, range, r); - } - if (a != static_cast(0)) { - caffe_gpu_add_scalar(n, a, r); - } +float* r) { +CURAND_CHECK(curandGenerateUniform(Caffe::curand_generator(), r, n)); +const float range = b - a; +if (range != static_cast(1)) { +caffe_gpu_scal(n, range, r); +} +if (a != static_cast(0)) { +caffe_gpu_add_scalar(n, a, r); +} } template <> void caffe_gpu_rng_uniform(const int n, const double a, const double b, - double* r) { - CURAND_CHECK(curandGenerateUniformDouble(Caffe::curand_generator(), r, n)); - const double range = b - a; - if (range != static_cast(1)) { - caffe_gpu_scal(n, range, r); - } - if (a != static_cast(0)) { - caffe_gpu_add_scalar(n, a, r); - } +double* r) { +CURAND_CHECK(curandGenerateUniformDouble(Caffe::curand_generator(), r, n)); +const double range = b - a; +if (range != static_cast(1)) { +caffe_gpu_scal(n, range, r); +} +if (a != static_cast(0)) { +caffe_gpu_add_scalar(n, a, r); +} } template <> void caffe_gpu_rng_gaussian(const int n, const float mu, const float sigma, - float* r) { - CURAND_CHECK( - curandGenerateNormal(Caffe::curand_generator(), r, n, mu, sigma)); +float* r) { +CURAND_CHECK(curandGenerateNormal(Caffe::curand_generator(), r, n, mu, sigma)); } template <> void caffe_gpu_rng_gaussian(const int n, const double mu, const double sigma, - double* r) { - CURAND_CHECK( - curandGenerateNormalDouble(Caffe::curand_generator(), r, n, mu, sigma)); +double* r) { +CURAND_CHECK( +curandGenerateNormalDouble(Caffe::curand_generator(), r, n, mu, sigma)); } } // namespace caffe diff --git a/src/caffe/util/ocl_util.cpp b/src/caffe/util/ocl_util.cpp new file mode 100644 index 00000000..bc2aea35 --- /dev/null +++ b/src/caffe/util/ocl_util.cpp @@ -0,0 +1,96 @@ 
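
caffe_gpu_rng_uniform above maps curand's (0, 1] output onto the requested interval with one scale by (b - a) and one shift by a. The same affine transform on the host, as a hypothetical helper that is not part of the patch:

    #include <cstddef>

    // Rescale uniform samples u in (0, 1] in place to the interval (a, b].
    void rescale_uniform(const std::size_t n, const float a, const float b,
                         float* r) {
      const float range = b - a;
      for (std::size_t i = 0; i < n; ++i) {
        r[i] = a + range * r[i];  // same effect as gpu_scal(range) + add_scalar(a)
      }
    }
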
+/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + +#include +#include +#include +#include +#include +#include "caffe/common.hpp" +#include "caffe/util/ocl_util.hpp" +namespace caffe { + +#ifndef CPU_ONLY + +template extern std::string get_dtype_suffix(); + +template +void ocl_memset(Dtype* buffer, const Dtype value, const int count, const int buf_offset) { + std::string kernel_name = std::string("oclmem") + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int err = 0; + err = clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*) &buffer); + err |= clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &value); + err |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &count); + err |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &buf_offset); + OCL_CHECK(err); + + size_t Global_Work_Size[1] = { (size_t) count }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + +} + +template void ocl_memset(int* buffer, const int value, const int count, const int buf_offset); +template void ocl_memset(float* buffer, const float value, const int count, const int buf_offset); +template void ocl_memset(double* buffer, const double value, const int count, const int buf_offset); + +void ocl_memset(cl_mem buffer, const int value, + const int count) { + std::string kernel_name = std::string("OCL_memset2"); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int err; + err = clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*) &buffer); + err |= clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &value); + err |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &count); + OCL_CHECK(err); + + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + +} + +void eventCallback(cl_event event, cl_int 
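
ocl_memset above follows the launch pattern used throughout this patch: fetch a prebuilt kernel by name, bind its arguments with clSetKernelArg, then enqueue a 1-D NDRange with a fixed work-group size of 256. OpenCL 1.x requires the global size to be a multiple of an explicitly given local size, so a launcher typically rounds the element count up and lets the kernel guard against the overshoot. A stripped-down sketch of such a helper (hypothetical; the queue and kernel handles are assumed to exist and to have their arguments bound already):

    #include <CL/cl.h>

    // Enqueue a 1-D kernel; the kernel itself is expected to compare its
    // global id against the real element count before writing.
    cl_int launch_1d(cl_command_queue queue, cl_kernel kernel, size_t count) {
      const size_t local = 256;
      const size_t global = ((count + local - 1) / local) * local;
      return clEnqueueNDRangeKernel(queue, kernel, 1, NULL,
                                    &global, &local, 0, NULL, NULL);
    }
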
event_status, void* user_data) { + cl_ulong ev_start_time = (cl_ulong) 0; + cl_ulong ev_end_time = (cl_ulong) 0; + double run_time; + OCL_CHECK( + clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED, + sizeof(cl_ulong), &ev_start_time, NULL)); + OCL_CHECK( + clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), + &ev_end_time, NULL)); + run_time = (double) (ev_end_time - ev_start_time); + printf("The kernel's running time is %f s\n", run_time * 1.0e-9); +} + +#endif +} // namespace caffe diff --git a/src/caffe/util/ocl_wrapper.cpp b/src/caffe/util/ocl_wrapper.cpp new file mode 100644 index 00000000..0b4cbf6f --- /dev/null +++ b/src/caffe/util/ocl_wrapper.cpp @@ -0,0 +1,2017 @@ +/************************************************************************************* + * Copyright (c) 2015, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ **************************************************************************************/ + +#include +#include +#include +#include +#include +#include "caffe/common.hpp" +#include "caffe/util/ocl_util.hpp" +#include "caffe/util/ocl_wrapper.hpp" +namespace caffe { + +#ifndef CPU_ONLY +typedef unsigned int uint32_t; +struct array4x32 { + uint32_t v[4]; +}; +template +void caffe_gpu_bernoulli(int* a, const unsigned int n, Dtype inf, Dtype sup, + Dtype threshold) { + std::string kernel_name = "RNGBernoulli" + get_dtype_suffix(); + cl_kernel ker_rand = amdDevice.GetKernel(kernel_name); + + static unsigned c = 0; + unsigned nrounds = 20; + array4x32 rndctr4; + rndctr4.v[0] = rndctr4.v[1] = rndctr4.v[2] = rndctr4.v[3] = c++; + cl_uint size = n / 4; //Note: for correctness, we need to make sure n is dividable by 4 + + cl_int ret; + ret = clSetKernelArg(ker_rand, 0, sizeof(cl_mem), (void*) &a); + ret |= clSetKernelArg(ker_rand, 1, sizeof(array4x32), (void*) &rndctr4); + ret |= clSetKernelArg(ker_rand, 2, sizeof(Dtype), (void*) &inf); + ret |= clSetKernelArg(ker_rand, 3, sizeof(Dtype), (void*) &sup); + ret |= clSetKernelArg(ker_rand, 4, sizeof(Dtype), (void*) &threshold); + ret |= clSetKernelArg(ker_rand, 5, sizeof(cl_uint), (void*) &nrounds); + ret |= clSetKernelArg(ker_rand, 6, sizeof(cl_uint), (void*) &size); + OCL_CHECK(ret); + + size_t globalws[1] = { size }; + size_t localws[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, + globalws, localws, 0, NULL, NULL)); +} +template void caffe_gpu_bernoulli(int* a, const unsigned int n, + float inf, float sup, float threshold); +template void caffe_gpu_bernoulli(int* a, const unsigned int n, + double inf, double sup, double threshold); + +template +void transform_gpu(Dtype* src, Dtype* dst, const int top_offset, const int N_, + const int M_, const int packing_num) { + std::string kernel_name = "transform" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*) &src); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &dst); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &top_offset); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &N_); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &M_); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &packing_num); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size2[] = { (size_t)(M_ * packing_num) }; + size_t uiLocal_Work_Size2[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size2, uiLocal_Work_Size2, 0, NULL, NULL)); +} + +template void transform_gpu(float* src, float* dst, const int top_offset, + const int N_, const int M_, const int packing_num); +template void transform_gpu(double* src, double* dst, + const int top_offset, const int N_, const int M_, const int packing_num); + +template +void get_max_gpu(cl_kernel Kernel, const int num, const int dim, + const Dtype* bottom_data, Dtype* scale_data) { + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &dim)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &bottom_data)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &scale_data)); + + size_t Global_Work_Size[1] = { (size_t) num }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + 
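
caffe_gpu_bernoulli above drives a counter-based RNG (threefry4x32): the host packs one 32-bit counter value into all four lanes, bumps it on every call, and launches n / 4 work-items so each one produces four samples. As the inline comment notes, n is assumed to be divisible by 4; a host-side guard for that assumption could look like this hypothetical helper:

    #include <stdexcept>

    // Work-item count for a counter-based RNG kernel that emits 4 values each.
    unsigned int rng_work_items(const unsigned int n) {
      if (n % 4 != 0) {
        throw std::invalid_argument("RNG element count must be a multiple of 4");
      }
      return n / 4;
    }
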
Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void get_max_gpu(cl_kernel Kernel, const int num, const int dim, + const float* bottom_data, float* scale_data); +template void get_max_gpu(cl_kernel Kernel, const int num, + const int dim, const double* bottom_data, double* scale_data); + +template +void caffe_gpu_uniform(Dtype* a, const unsigned int n, Dtype inf, Dtype sup, unsigned int seed_) +{ + static unsigned c = 0; + if ((n == 0) || (a == NULL)) { + c = seed_; + return; + } + std::string kernel_name = "RNGUniform" + get_dtype_suffix(); + cl_kernel ker_rand = amdDevice.GetKernel(kernel_name); + + unsigned nrounds = 20; + array4x32 rndctr4; + rndctr4.v[0] = rndctr4.v[1] = rndctr4.v[2] = rndctr4.v[3] = c++; + cl_uint size = n / 4; //Note: for correctness, we need to make sure n is dividable by 4 + + cl_int ret; + ret = clSetKernelArg(ker_rand, 0, sizeof(cl_mem), (void*)&a); + ret |= clSetKernelArg(ker_rand, 1, sizeof(array4x32), (void*)&rndctr4); + ret |= clSetKernelArg(ker_rand, 2, sizeof(Dtype), (void*)&inf); + ret |= clSetKernelArg(ker_rand, 3, sizeof(Dtype), (void*)&sup); + ret |= clSetKernelArg(ker_rand, 4, sizeof(cl_uint), (void*)&nrounds); + ret |= clSetKernelArg(ker_rand, 5, sizeof(cl_uint), (void*)&size); + OCL_CHECK(ret); + + size_t globalws[1] = {size}; + size_t localws[1] = {256}; + OCL_CHECK (clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, globalws, localws, 0, NULL, NULL) ); +} +template void caffe_gpu_uniform(float* a, const unsigned int n, float inf, float sup, unsigned int seed_); +template void caffe_gpu_uniform(double* a, const unsigned int n, double inf, double sup, unsigned int seed_); + +void caffe_gpu_uniform(const unsigned int n, unsigned int *r, unsigned int _seed) +{ + static unsigned c = 0; + if ((n == 0) || (r == NULL)) { + c = _seed; + return; + } + std::string kernel_name = "PRNG_threefry4x32_uint_uniform"; + cl_kernel ker_rand = amdDevice.GetKernel(kernel_name); + + unsigned nrounds = 20; + array4x32 rndctr4; + rndctr4.v[0] = rndctr4.v[1] = rndctr4.v[2] = rndctr4.v[3] = c++; + cl_uint size = n / 4; //Note: for correctness, we need to make sure n is dividable by 4 + + cl_uint inf = 0; + cl_uint sup = UINT_MAX; + cl_int ret; + ret = clSetKernelArg(ker_rand, 0, sizeof(cl_mem), (void*)&r); + ret |= clSetKernelArg(ker_rand, 1, sizeof(array4x32), (void*)&rndctr4); + ret |= clSetKernelArg(ker_rand, 2, sizeof(cl_uint), (void*)&inf); + ret |= clSetKernelArg(ker_rand, 3, sizeof(cl_uint), (void*)&sup); + ret |= clSetKernelArg(ker_rand, 4, sizeof(cl_uint), (void*)&nrounds); + ret |= clSetKernelArg(ker_rand, 5, sizeof(cl_uint), (void*)&size); + OCL_CHECK(ret); + + size_t globalws[1] = {size}; + size_t localws[1] = {256}; + OCL_CHECK (clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, globalws, localws, 0, NULL, NULL) ); +} + +template +void caffe_gpu_gaussian(Dtype* a, const unsigned int n, Dtype E, Dtype V) +{ + std::string kernel_name = "RNGGaussian" + get_dtype_suffix(); + cl_kernel ker_rand = amdDevice.GetKernel(kernel_name); + + static unsigned c = 0; + unsigned nrounds = 20; + array4x32 rndctr4; + rndctr4.v[0] = rndctr4.v[1] = rndctr4.v[2] = rndctr4.v[3] = c++; + cl_uint size = n / 4; //Note: for correctness, we need to make sure n is dividable by 4 + + cl_int ret; + ret = clSetKernelArg(ker_rand, 0, sizeof(cl_mem), (void*)&a); + ret |= clSetKernelArg(ker_rand, 1, sizeof(array4x32), (void*)&rndctr4); + ret |= clSetKernelArg(ker_rand, 2, sizeof(Dtype), (void*)&E); + ret |= clSetKernelArg(ker_rand, 3, 
sizeof(Dtype), (void*)&V); + ret |= clSetKernelArg(ker_rand, 4, sizeof(cl_uint), (void*)&nrounds); + ret |= clSetKernelArg(ker_rand, 5, sizeof(cl_uint), (void*)&size); + OCL_CHECK(ret); + + size_t globalws[1] = {size}; + size_t localws[1] = {256}; + OCL_CHECK (clEnqueueNDRangeKernel(amdDevice.CommandQueue, ker_rand, 1, NULL, globalws, localws, 0, NULL, NULL) ); +} +template void caffe_gpu_gaussian(float* a, const unsigned int n, float E, float V); +template void caffe_gpu_gaussian(double* a, const unsigned int n, double E, double V); + +template +void exp_gpu(cl_kernel Kernel, const int num, const Dtype* data, Dtype* out) { + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &out)); + + size_t Global_Work_Size[1] = { (size_t) num }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void exp_gpu(cl_kernel Kernel, const int num, const float* data, + float* out); +template void exp_gpu(cl_kernel Kernel, const int num, + const double* data, double* out); + +template +void softmax_div_gpu(cl_kernel Kernel, const int num, const int dim, + const Dtype* scale, Dtype* data) { + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &dim)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &scale)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &data)); + + size_t Global_Work_Size[1] = { (size_t)(num * dim) }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void softmax_div_gpu(cl_kernel Kernel, const int num, + const int dim, const float* scale, float* data); +template void softmax_div_gpu(cl_kernel Kernel, const int num, + const int dim, const double* scale, double* data); + +template +Dtype softmax_gpu(cl_kernel Kernel, const int num, const int dim, + const Dtype* prob_data, const Dtype* label, cl_mem d_loss) { + + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_mem), (void*) &prob_data)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &d_loss)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &label)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &dim)); + OCL_CHECK(clSetKernelArg(Kernel, 5, 256 * sizeof(Dtype), NULL)); + + size_t globalws[1] = { 256 }; + size_t localws[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, globalws, + localws, 0, NULL, NULL)); + void* h_loss = clEnqueueMapBuffer(amdDevice.CommandQueue, d_loss, CL_TRUE, + CL_MAP_READ, 0, sizeof(Dtype), 0, NULL, NULL, NULL); + Dtype loss = *(Dtype*) h_loss; + clEnqueueUnmapMemObject(amdDevice.CommandQueue, d_loss, h_loss, 0, NULL, + NULL); + + return loss; +} + +template float softmax_gpu(cl_kernel Kernel, const int num, + const int dim, const float* prob_data, const float* label, cl_mem d_loss); +template double softmax_gpu(cl_kernel Kernel, const int num, + const int dim, const double* prob_data, const double* label, cl_mem d_loss); + +template +void kernel_channel_max(const int num, const int channels, + const int spatial_dim, const Dtype* data, Dtype* out) 
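
softmax_gpu above reads the reduced loss back by mapping d_loss with clEnqueueMapBuffer and unmapping it again. A blocking clEnqueueReadBuffer of sizeof(Dtype) bytes is the more common equivalent, sketched here for the float case:

    #include <CL/cl.h>

    // Blocking read of a single float result from a device buffer.
    float read_scalar(cl_command_queue queue, cl_mem d_loss) {
      float loss = 0.0f;
      clEnqueueReadBuffer(queue, d_loss, CL_TRUE /* blocking */, 0,
                          sizeof(float), &loss, 0, NULL, NULL);
      return loss;
    }
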
{ + std::string kernel_name = "kernel_channel_max" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &channels)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &spatial_dim)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &data)); + OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &out)); + + size_t Global_Work_Size[1] = { (size_t)(num * spatial_dim) }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void kernel_channel_max(const int num, const int channels, + const int spatial_dim, const float* data, float* out); +template void kernel_channel_max(const int num, const int channels, + const int spatial_dim, const double* data, double* out); + +template +void kernel_channel_subtract(const int count, const int num, const int channels, + const int spatial_dim, const Dtype* channel_max, Dtype* data) { + std::string kernel_name = "kernel_channel_subtract" + + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &channels)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &spatial_dim)); + OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &channel_max)); + OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &data)); + + size_t Global_Work_Size[1] = { (size_t) count }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void kernel_channel_subtract(const int count, const int num, + const int channels, const int spatial_dim, const float* channel_max, + float* data); +template void kernel_channel_subtract(const int count, const int num, + const int channels, const int spatial_dim, const double* channel_max, + double* data); + +template +void kernel_mul(const int count, const Dtype* a, const Dtype* b, Dtype* out) { + std::string kernel_name = "kernel_mul" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &out)); + + size_t Global_Work_Size[1] = { (size_t) count }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void kernel_mul(const int count, const float* a, const float* b, + float* out); +template void kernel_mul(const int count, const double* a, + const double* b, double* out); + +template +void kernel_add_scalar(const int count, const Dtype data, Dtype* out) { + std::string kernel_name = "kernel_add_scalar" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &data)); + 
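
kernel_channel_max and kernel_channel_subtract above implement the numerically stable softmax preamble on a blob laid out as (num, channels, spatial_dim): one work-item per (n, s) pair scans the channel axis. A CPU reference for the same indexing, assuming the standard Caffe layout data[(n * channels + c) * spatial_dim + s]:

    #include <algorithm>

    // For each (n, s), subtract the per-position maximum over channels.
    void channel_max_subtract(const int num, const int channels,
                              const int spatial_dim, float* data) {
      for (int n = 0; n < num; ++n) {
        for (int s = 0; s < spatial_dim; ++s) {
          float maxval = data[(n * channels + 0) * spatial_dim + s];
          for (int c = 1; c < channels; ++c) {
            maxval = std::max(maxval, data[(n * channels + c) * spatial_dim + s]);
          }
          for (int c = 0; c < channels; ++c) {
            data[(n * channels + c) * spatial_dim + s] -= maxval;
          }
        }
      }
    }
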
OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &out)); + + size_t Global_Work_Size[1] = { (size_t) count }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void kernel_add_scalar(const int count, const float data, + float* out); +template void kernel_add_scalar(const int count, const double data, + double* out); + +template +void kernel_powx(const int count, const Dtype* data, const Dtype alpha, + Dtype* out) { + std::string kernel_name = "kernel_powx" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(Dtype), (void*) &alpha)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &out)); + + size_t Global_Work_Size[1] = { (size_t) count }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void kernel_powx(const int count, const float* data, + const float alpha, float* out); +template void kernel_powx(const int count, const double* data, + const double alpha, double* out); + +template +void kernel_div(const int count, const Dtype* a, const Dtype* b, Dtype* out) { + std::string kernel_name = "kernel_div" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &out)); + + size_t Global_Work_Size[1] = { (size_t) count }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void kernel_div(const int count, const float* a, const float* b, + float* out); +template void kernel_div(const int count, const double* a, + const double* b, double* out); + +template +void kernel_add(const int count, const Dtype* a, const Dtype* b, Dtype* out) { + std::string kernel_name = "kernel_add" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &out)); + + size_t Global_Work_Size[1] = { (size_t) count }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void kernel_add(const int count, const float* a, const float* b, + float* out); +template void kernel_add(const int count, const double* a, + const double* b, double* out); + +template +void kernel_sub(const int count, const Dtype* a, const Dtype* b, Dtype* out) { + std::string kernel_name = "kernel_sub" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) 
&a)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &out)); + + size_t Global_Work_Size[1] = { (size_t) count }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void kernel_sub(const int count, const float* a, const float* b, + float* out); +template void kernel_sub(const int count, const double* a, + const double* b, double* out); + +template +void kernel_log(const int count, const Dtype* data, Dtype* out) { + std::string kernel_name = "kernel_log" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &out)); + + size_t Global_Work_Size[1] = { (size_t) count }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void kernel_log(const int count, const float* data, float* out); +template void kernel_log(const int count, const double* data, + double* out); + +template +void kernel_exp(const int count, const Dtype* data, Dtype* out) { + std::string kernel_name = "kernel_exp" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &out)); + + size_t Global_Work_Size[1] = { (size_t) count }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void kernel_exp(const int count, const float* data, float* out); +template void kernel_exp(const int count, const double* data, + double* out); + +template +void kernel_channel_sum(const int num, const int channels, + const int spatial_dim, const Dtype* data, Dtype* channel_sum) { + std::string kernel_name = "kernel_channel_sum" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &channels)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &spatial_dim)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &data)); + OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &channel_sum)); + + size_t Global_Work_Size[1] = { (size_t)(num * spatial_dim) }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void kernel_channel_sum(const int num, const int channels, + const int spatial_dim, const float* data, float* channel_sum); +template void kernel_channel_sum(const int num, const int channels, + const int spatial_dim, const double* data, double* channel_sum); + +template +void kernel_channel_div(const int count, const int num, const int channels, + const int spatial_dim, const Dtype* channel_sum, Dtype* data) { + std::string kernel_name = "kernel_channel_div" + get_dtype_suffix(); + cl_kernel Kernel = 
amdDevice.GetKernel(kernel_name); + + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &channels)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &spatial_dim)); + OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &channel_sum)); + OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &data)); + + size_t Global_Work_Size[1] = { (size_t) count }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void kernel_channel_div(const int count, const int num, + const int channels, const int spatial_dim, const float* channel_sum, + float* data); +template void kernel_channel_div(const int count, const int num, + const int channels, const int spatial_dim, const double* channel_sum, + double* data); + +template +void kernel_channel_dot(const int num, const int channels, + const int spatial_dim, const Dtype* data_1, const Dtype* data_2, + Dtype* channel_dot) { + std::string kernel_name = "kernel_channel_dot" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &channels)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &spatial_dim)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &data_1)); + OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &data_2)); + OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &channel_dot)); + + size_t Global_Work_Size[1] = { (size_t)(num * spatial_dim) }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void kernel_channel_dot(const int num, const int channels, + const int spatial_dim, const float* data_1, const float* data_2, + float* channel_dot); +template void kernel_channel_dot(const int num, const int channels, + const int spatial_dim, const double* data_1, const double* data_2, + double* channel_dot); + +template +void SoftmaxLossForwardGPU(const int nthreads, const Dtype* prob_data, + const Dtype* label, Dtype* loss, const int num, const int dim, + const int spatial_dim, const bool has_ignore_label_, + const int ignore_label_, Dtype* counts) { + std::string kernel_name = "SoftmaxLossForwardGPU" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + int int_has_ignore_label = has_ignore_label_ ? 
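
kernel_channel_dot above computes, for every (n, s) position, the dot product of two blobs along the channel axis; in the softmax backward pass this is the dot of top_diff and top_data that gets subtracted before the elementwise multiply. A CPU reference of that use for a single channel column, assuming the usual softmax gradient bottom_diff_c = (top_diff_c - sum_k top_diff_k * top_data_k) * top_data_c:

    // Softmax backward for one channel column (one fixed n and s).
    void softmax_backward_column(const int channels, const float* top_data,
                                 const float* top_diff, float* bottom_diff) {
      float dot = 0.0f;
      for (int c = 0; c < channels; ++c) {
        dot += top_diff[c] * top_data[c];   // what kernel_channel_dot produces
      }
      for (int c = 0; c < channels; ++c) {  // subtract, then scale by the output
        bottom_diff[c] = (top_diff[c] - dot) * top_data[c];
      }
    }
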
1 : 0; + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &prob_data)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &label)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &loss)); + OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &dim)); + OCL_CHECK(clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &spatial_dim)); + OCL_CHECK( + clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &int_has_ignore_label)); + OCL_CHECK(clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &ignore_label_)); + OCL_CHECK(clSetKernelArg(Kernel, 9, sizeof(cl_mem), (void*) &counts)); + + size_t Global_Work_Size[1] = { (size_t) nthreads }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void SoftmaxLossForwardGPU(const int nthreads, + const float* prob_data, const float* label, float* loss, const int num, + const int dim, const int spatial_dim, const bool has_ignore_label_, + const int ignore_label_, float* counts); +template void SoftmaxLossForwardGPU(const int nthreads, + const double* prob_data, const double* label, double* loss, const int num, + const int dim, const int spatial_dim, const bool has_ignore_label_, + const int ignore_label_, double* counts); + +template +void SoftmaxLossBackwardGPU(const int nthreads, const Dtype* top, + const Dtype* label, Dtype* bottom_diff, const int num, const int dim, + const int spatial_dim, const bool has_ignore_label_, + const int ignore_label_, Dtype* counts) { + std::string kernel_name = "SoftmaxLossBackwardGPU" + + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + int int_has_ignore_label = has_ignore_label_ ? 
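
SoftmaxLossForwardGPU above evaluates -log(prob[label]) per sample, skips samples whose label equals ignore_label_, and records in counts how many samples actually contributed so the loss can be normalized afterwards. A scalar CPU reference of the per-sample step; clamping the probability at FLT_MIN mirrors the CUDA kernel and is an assumption here:

    #include <algorithm>
    #include <cfloat>
    #include <cmath>

    // prob points at the per-class probabilities of one sample position.
    float sample_loss(const float* prob, const int label,
                      const bool has_ignore, const int ignore_label, int* count) {
      if (has_ignore && label == ignore_label) {
        return 0.0f;                     // contributes nothing and is not counted
      }
      ++(*count);
      return -std::log(std::max(prob[label], FLT_MIN));
    }
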
1 : 0; + + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &label)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_diff)); + OCL_CHECK(clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &dim)); + OCL_CHECK(clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &spatial_dim)); + OCL_CHECK( + clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &int_has_ignore_label)); + OCL_CHECK(clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &ignore_label_)); + OCL_CHECK(clSetKernelArg(Kernel, 9, sizeof(cl_mem), (void*) &counts)); + + size_t Global_Work_Size[1] = { (size_t) nthreads }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void SoftmaxLossBackwardGPU(const int nthreads, + const float* top, const float* label, float* bottom_diff, const int num, + const int dim, const int spatial_dim, const bool has_ignore_label_, + const int ignore_label_, float* counts); +template void SoftmaxLossBackwardGPU(const int nthreads, + const double* top, const double* label, double* bottom_diff, const int num, + const int dim, const int spatial_dim, const bool has_ignore_label_, + const int ignore_label_, double* counts); + +template +void scal_gpu(cl_kernel Kernel, const int num, const Dtype alpha, Dtype* data) { + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &alpha)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &data)); + + size_t Global_Work_Size[1] = { (size_t) num }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void scal_gpu(cl_kernel Kernel, const int num, + const float alpha, float* data); +template void scal_gpu(cl_kernel Kernel, const int num, + const double alpha, double* data); + +template +void diff_gpu(cl_kernel Kernel, const int num, int dim, Dtype* data, + const Dtype* label) { + OCL_CHECK(clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num)); + OCL_CHECK(clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &dim)); + OCL_CHECK(clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &data)); + OCL_CHECK(clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &label)); + + size_t Global_Work_Size[1] = { (size_t) num }; + size_t Local_Work_Size[1] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void diff_gpu(cl_kernel Kernel, const int num, const int dim, + float* data, const float* label); +template void diff_gpu(cl_kernel Kernel, const int num, const int dim, + double* data, const double* label); + +template +void max_pool_fp_gpu(cl_kernel Kernel, const int count, + const Dtype* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + Dtype* top_data) { + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) 
&clnum); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_size_); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &stride_); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void max_pool_fp_gpu(cl_kernel Kernel, const int count, + const float* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + float* top_data); +template void max_pool_fp_gpu(cl_kernel Kernel, const int count, + const double* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + double* top_data); + +template +void MaxPoolForward(const int count, const Dtype* bottom_data, const int clnum, + const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + const int pad_h_, const int pad_w_, Dtype* top_data, int* mask, + Dtype* top_mask) { + std::string kernel_name = "MaxPoolForward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_h_); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_w_); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h_); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w_); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &pad_h_); + ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &pad_w_); + ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*) &top_data); + ret |= clSetKernelArg(Kernel, 15, sizeof(cl_mem), (void*) &mask); + ret |= clSetKernelArg(Kernel, 16, sizeof(cl_mem), (void*) &top_mask); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void MaxPoolForward(const int count, const float* bottom_data, + const int clnum, const int channels_, const int height_, const int width_, + const int 
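
MaxPoolForward above receives the pooled output extents as arguments; with padding and stride they follow the usual pooled = (input + 2 * pad - kernel) / stride + 1 rule, and every output element reduces an input window clipped to the image. A CPU sketch of that window reduction for one output position (ph, pw) of a single-channel row-major image:

    #include <algorithm>
    #include <cfloat>

    float max_pool_one(const float* image, const int height, const int width,
                       const int ph, const int pw, const int kernel_h,
                       const int kernel_w, const int stride_h, const int stride_w,
                       const int pad_h, const int pad_w) {
      int hstart = ph * stride_h - pad_h;
      int wstart = pw * stride_w - pad_w;
      const int hend = std::min(hstart + kernel_h, height);
      const int wend = std::min(wstart + kernel_w, width);
      hstart = std::max(hstart, 0);
      wstart = std::max(wstart, 0);
      float maxval = -FLT_MAX;
      for (int h = hstart; h < hend; ++h) {
        for (int w = wstart; w < wend; ++w) {
          maxval = std::max(maxval, image[h * width + w]);
        }
      }
      return maxval;
    }
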
pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + const int pad_h_, const int pad_w_, float* top_data, int* mask, + float* top_mask); +template void MaxPoolForward(const int count, const double* bottom_data, + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + const int pad_h_, const int pad_w_, double* top_data, int* mask, + double* top_mask); + +template +void StoPoolForwardTrain(const int count, const Dtype* bottom_data, + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + Dtype* idx_data, Dtype* top_data) { + std::string kernel_name = "StoPoolForwardTrain" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_h_); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_w_); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h_); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w_); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_mem), (void*) &idx_data); + ret |= clSetKernelArg(Kernel, 13, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} +template void StoPoolForwardTrain(const int count, + const float* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_h_, const int kernel_w_, + const int stride_h_, const int stride_w_, float* idx_data, float* top_data); +template void StoPoolForwardTrain(const int count, + const double* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_h_, const int kernel_w_, + const int stride_h_, const int stride_w_, double* idx_data, + double* top_data); + +template +void StoPoolForwardTest(const int count, const Dtype* bottom_data, + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + Dtype* top_data) { + std::string kernel_name = "StoPoolForwardTest" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, 
sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_h_); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_w_); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h_); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w_); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); + +} +template void StoPoolForwardTest(const int count, + const float* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_h_, const int kernel_w_, + const int stride_h_, const int stride_w_, float* top_data); +template void StoPoolForwardTest(const int count, + const double* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_h_, const int kernel_w_, + const int stride_h_, const int stride_w_, double* top_data); + +template +void AvePoolForward(const int count, const Dtype* bottom_data, const int clnum, + const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + const int pad_h_, const int pad_w_, Dtype* top_data) { + std::string kernel_name = "AvePoolForward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_h_); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_w_); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h_); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w_); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &pad_h_); + ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &pad_w_); + ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) count }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} +template void AvePoolForward(const 
int count, const float* bottom_data, + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + const int pad_h_, const int pad_w_, float* top_data); +template void AvePoolForward(const int count, const double* bottom_data, + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_h_, + const int kernel_w_, const int stride_h_, const int stride_w_, + const int pad_h_, const int pad_w_, double* top_data); + +template +void ave_pool_fp_gpu(cl_kernel Kernel, const int count, + const Dtype* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + const int pad_, Dtype* top_data) { + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_size_); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &stride_); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &pad_); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) count }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} + +template void ave_pool_fp_gpu(cl_kernel Kernel, const int count, + const float* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + const int pad_, float* top_data); +template void ave_pool_fp_gpu(cl_kernel Kernel, const int count, + const double* bottom_data, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + const int pad_, double* top_data); + +template +void max_pool_bp_gpu(cl_kernel Kernel, const int count, + const Dtype* bottom_data, const Dtype* top_data, const Dtype* top_diff, + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_size_, + const int stride_, Dtype* bottom_diff) { + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &clnum); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels_); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &height_); + ret |= 
clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &width_); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pooled_height_); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &pooled_width_); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &kernel_size_); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) count }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} + +template void max_pool_bp_gpu(cl_kernel Kernel, const int count, + const float* bottom_data, const float* top_data, const float* top_diff, + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_size_, + const int stride_, float* bottom_diff); +template void max_pool_bp_gpu(cl_kernel Kernel, const int count, + const double* bottom_data, const double* top_data, const double* top_diff, + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_size_, + const int stride_, double* bottom_diff); + +template +void MaxPoolBackward(const int nthreads, const Dtype* const top_diff, + const int* const mask, const Dtype* const top_mask, const int num, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, const int pad_h, + const int pad_w, Dtype* const bottom_diff) { + std::string kernel_name = "MaxPoolBackward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &mask); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &top_mask); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &num); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &width); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pooled_height); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &pooled_width); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &kernel_h); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &kernel_w); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &stride_h); + ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &stride_w); + ret |= clSetKernelArg(Kernel, 14, sizeof(cl_int), (void*) &pad_h); + ret |= clSetKernelArg(Kernel, 15, sizeof(cl_int), (void*) &pad_w); + ret |= clSetKernelArg(Kernel, 16, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) nthreads }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} + +template void MaxPoolBackward(const int nthreads, + const float* const top_diff, const int* const mask, + const float* const top_mask, const int num, const int channels, + const int 
height, const int width, const int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, const int pad_h, const int pad_w, + float* const bottom_diff); +template void MaxPoolBackward(const int nthreads, + const double* const top_diff, const int* const mask, + const double* const top_mask, const int num, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, const int pad_h, const int pad_w, + double* const bottom_diff); + +template +void AvePoolBackward(const int nthreads, const Dtype* const top_diff, + const int num, const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, const int pad_h, + const int pad_w, Dtype* const bottom_diff) { + std::string kernel_name = "AvePoolBackward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &num); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_h); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_w); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &stride_h); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_w); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &pad_h); + ret |= clSetKernelArg(Kernel, 13, sizeof(cl_int), (void*) &pad_w); + ret |= clSetKernelArg(Kernel, 14, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) nthreads }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} +template void AvePoolBackward(const int nthreads, + const float* const top_diff, const int num, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, const int pad_h, const int pad_w, + float* const bottom_diff); +template void AvePoolBackward(const int nthreads, + const double* const top_diff, const int num, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, const int pad_h, const int pad_w, + double* const bottom_diff); + +template +void StoPoolBackward(const int nthreads, const Dtype* const rand_idx, + const Dtype* const top_diff, const int num, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, Dtype* const bottom_diff) { + std::string kernel_name = "StoPoolBackward" + 
get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &nthreads); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &rand_idx); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &num); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &width); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_height); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &pooled_width); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &kernel_h); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &kernel_w); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_int), (void*) &stride_h); + ret |= clSetKernelArg(Kernel, 12, sizeof(cl_int), (void*) &stride_w); + ret |= clSetKernelArg(Kernel, 13, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) nthreads }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} +template void StoPoolBackward(const int nthreads, + const float* const rand_idx, const float* const top_diff, const int num, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, + float* const bottom_diff); +template void StoPoolBackward(const int nthreads, + const double* const rand_idx, const double* const top_diff, const int num, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, + double* const bottom_diff); + +template +void ave_pool_bp_gpu(cl_kernel Kernel, const int count, const Dtype* top_diff, + const int clnum, const int channels_, const int height_, const int width_, + const int pooled_height_, const int pooled_width_, const int kernel_size_, + const int stride_, const int pad_, Dtype* bottom_diff) { + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &clnum); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &channels_); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &height_); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &width_); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &pooled_height_); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &pooled_width_); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &kernel_size_); + ret |= clSetKernelArg(Kernel, 9, sizeof(cl_int), (void*) &stride_); + ret |= clSetKernelArg(Kernel, 10, sizeof(cl_int), (void*) &pad_); + ret |= clSetKernelArg(Kernel, 11, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) count }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} + +template void ave_pool_bp_gpu(cl_kernel Kernel, const int count, + const float* 
top_diff, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + const int pad_, float* bottom_diff); +template void ave_pool_bp_gpu(cl_kernel Kernel, const int count, + const double* top_diff, const int clnum, const int channels_, + const int height_, const int width_, const int pooled_height_, + const int pooled_width_, const int kernel_size_, const int stride_, + const int pad_, double* bottom_diff); + +template <typename Dtype> +void PReLUForward(const int count, const int channels, const int dim, + const Dtype* bottom_data, Dtype* top_data, const Dtype* slope_data, + const int div_factor) { + std::string kernel_name = "PReLUForward" + get_dtype_suffix<Dtype>(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &dim); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &top_data); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &slope_data); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_int), (void*) &div_factor); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} +template void PReLUForward(const int count, const int channels, + const int dim, const float* bottom_data, float* top_data, + const float* slope_data, const int div_factor); +template void PReLUForward(const int count, const int channels, + const int dim, const double* bottom_data, double* top_data, + const double* slope_data, const int div_factor); + +template <typename Dtype> +void PReLUBackward(const int count, const int channels, const int dim, + const Dtype* top_diff, const Dtype* bottom_data, Dtype* bottom_diff, + const Dtype* slope_data, const int div_factor) { + std::string kernel_name = "PReLUBackward" + get_dtype_suffix<Dtype>(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &dim); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &bottom_diff); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_mem), (void*) &slope_data); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &div_factor); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} +template void PReLUBackward(const int count, const int channels, + const int dim, const float* top_diff, const float* bottom_data, + float* bottom_diff, const float* slope_data, const int div_factor); +template void PReLUBackward(const int count, const int channels, + const int dim, const double* top_diff, const double* bottom_data, + double* bottom_diff, const double* slope_data, const int div_factor); + +template <typename Dtype> +void PReLUParamBackward(const int count, const Dtype* top_diff, + const int
offset_out, const Dtype* bottom_data, const int offset_in, + Dtype* bottom_diff) { + std::string kernel_name = "PReLUParamBackward" + get_dtype_suffix<Dtype>(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &offset_out); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &offset_in); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} +template void PReLUParamBackward(const int count, const float* top_diff, + const int offset_out, const float* bottom_data, const int offset_in, + float* bottom_diff); +template void PReLUParamBackward(const int count, + const double* top_diff, const int offset_out, const double* bottom_data, + const int offset_in, double* bottom_diff); + +template <typename Dtype> +void ReLUForward(const int count, const Dtype* bottom_data, Dtype* top_data, + Dtype negative_slope) { + std::string kernel_name = "ReLUForward" + get_dtype_suffix<Dtype>(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data); + ret |= clSetKernelArg(Kernel, 3, sizeof(Dtype), (void*) &negative_slope); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void ReLUForward(const int count, const float* bottom_data, + float* top_data, float negative_slope); +template void ReLUForward(const int count, const double* bottom_data, + double* top_data, double negative_slope); + +template <typename Dtype> +void ReLUBackward(const int count, const Dtype* top_diff, + const Dtype* bottom_data, Dtype* bottom_diff, Dtype negative_slope) { + std::string kernel_name = "ReLUBackward" + get_dtype_suffix<Dtype>(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_diff); + ret |= clSetKernelArg(Kernel, 4, sizeof(Dtype), (void*) &negative_slope); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) count }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} +template void ReLUBackward(const int count, const float* top_diff, + const float* bottom_data, float* bottom_diff, float negative_slope); +template void ReLUBackward(const int count, const double* top_diff, + const double* bottom_data, double* bottom_diff, double negative_slope); + +template <typename Dtype> +void SigmoidForward(const int count, const Dtype* bottom_data, + Dtype* top_data) { + std::string kernel_name = "SigmoidForward" +
get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void SigmoidForward(const int count, const float* bottom_data, + float* top_data); +template void SigmoidForward(const int count, const double* bottom_data, + double* top_data); + +template +void SigmoidBackward(const int count, const Dtype* top_diff, + const Dtype* top_data, Dtype* bottom_diff) { + std::string kernel_name = "SigmoidBackward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) count }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} +template void SigmoidBackward(const int count, const float* top_diff, + const float* top_data, float* bottom_diff); +template void SigmoidBackward(const int count, const double* top_diff, + const double* top_data, double* bottom_diff); + +template +void ThresholdForward(const int count, const Dtype threshold, + const Dtype* bottom_data, Dtype* top_data) { + std::string kernel_name = "ThresholdForward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &threshold); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void ThresholdForward(const int count, const float threshold, + const float* bottom_data, float* top_data); +template void ThresholdForward(const int count, const double threshold, + const double* bottom_data, double* top_data); + +template +void TanHForward(const int count, const Dtype* bottom_data, Dtype* top_data) { + std::string kernel_name = "TanHForward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void TanHForward(const int count, const float* bottom_data, + float* 
top_data); +template void TanHForward(const int count, const double* bottom_data, + double* top_data); + +template +void TanHBackward(const int count, const Dtype* top_diff, const Dtype* top_data, + Dtype* bottom_diff) { + std::string kernel_name = "TanHBackward" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) count }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} +template void TanHBackward(const int count, const float* top_diff, + const float* top_data, float* bottom_diff); +template void TanHBackward(const int count, const double* top_diff, + const double* top_data, double* bottom_diff); + +template +void opttrans(const Dtype* data_im, const int im_offset, const int channels, + const int height, const int width, Dtype* data_opt, const int opt_offset, + const int optnum) { + std::string kernel_name = "opttrans" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + int num_kernels = channels * height * width * optnum; + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &num_kernels); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &data_im); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &im_offset); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &width); + ret |= clSetKernelArg(Kernel, 5, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(Kernel, 6, sizeof(cl_mem), (void*) &data_opt); + ret |= clSetKernelArg(Kernel, 7, sizeof(cl_int), (void*) &opt_offset); + ret |= clSetKernelArg(Kernel, 8, sizeof(cl_int), (void*) &optnum); + OCL_CHECK(ret); + + size_t uiGlobal_Work_Size[] = { (size_t) num_kernels }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} + +template void opttrans(const float* data_im, const int im_offset, + const int channels, const int height, const int width, float* data_opt, + const int opt_offset, const int optnum); +template void opttrans(const double* data_im, const int im_offset, + const int channels, const int height, const int width, double* data_opt, + const int opt_offset, const int optnum); + +template +void LRNFillScale(const int nthreads, const Dtype* const in, const int num, + const int channels, const int height, const int width, const int size, + const Dtype alpha_over_size, const Dtype k, Dtype* const scale) { + std::string kernel_name = "LRNFillScale" + get_dtype_suffix(); + cl_kernel LFSkernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(LFSkernel, 0, sizeof(cl_int), (void*) &nthreads); + ret |= clSetKernelArg(LFSkernel, 1, sizeof(cl_mem), (void*) &in); + ret |= clSetKernelArg(LFSkernel, 2, sizeof(cl_int), (void*) &num); + ret |= clSetKernelArg(LFSkernel, 3, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(LFSkernel, 4, sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(LFSkernel, 5, sizeof(cl_int), (void*) 
&width); + ret |= clSetKernelArg(LFSkernel, 6, sizeof(cl_int), (void*) &size); + ret |= clSetKernelArg(LFSkernel, 7, sizeof(Dtype), (void*) &alpha_over_size); + ret |= clSetKernelArg(LFSkernel, 8, sizeof(Dtype), (void*) &k); + ret |= clSetKernelArg(LFSkernel, 9, sizeof(cl_mem), (void*) &scale); + OCL_CHECK(ret); + size_t uiGlobal_Work_Size[] = { (size_t) nthreads }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, LFSkernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} +template void LRNFillScale(const int nthreads, const float* const in, + const int num, const int channels, const int height, const int width, + const int size, const float alpha_over_size, const float k, + float* const scale); +template void LRNFillScale(const int nthreads, const double* const in, + const int num, const int channels, const int height, const int width, + const int size, const double alpha_over_size, const double k, + double* const scale); + +template +void LRNComputeOutput(int nthreads, const Dtype* in, Dtype* scale, + Dtype negative_beta, Dtype* out) { + std::string kernel_name = "LRNComputeOutput" + get_dtype_suffix(); + cl_kernel LCOkernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(LCOkernel, 0, sizeof(cl_int), (void*) &nthreads); + ret |= clSetKernelArg(LCOkernel, 1, sizeof(cl_mem), (void*) &in); + ret |= clSetKernelArg(LCOkernel, 2, sizeof(cl_mem), (void*) &scale); + ret |= clSetKernelArg(LCOkernel, 3, sizeof(Dtype), (void*) &negative_beta); + ret |= clSetKernelArg(LCOkernel, 4, sizeof(cl_mem), (void*) &out); + OCL_CHECK(ret); + size_t uiGlobal_Work_Size2[] = { (size_t) nthreads }; + size_t uiLocal_Work_Size2[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCOkernel, 1, NULL, + uiGlobal_Work_Size2, uiLocal_Work_Size2, 0, NULL, NULL)); +} +template void LRNComputeOutput(int nthreads, const float* in, + float* scale, float negative_beta, float* out); +template void LRNComputeOutput(int nthreads, const double* in, + double* scale, double negative_beta, double* out); + +template +void LRNComputeDiff(const int nthreads, const Dtype* const bottom_data, + const Dtype* const top_data, const Dtype* const scale, + const Dtype* const top_diff, const int num, const int channels, + const int height, const int width, const int size, + const Dtype negative_beta, const Dtype cache_ratio, + Dtype* const bottom_diff) { + std::string kernel_name = "LRNComputeDiff" + get_dtype_suffix(); + cl_kernel LCDkernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(LCDkernel, 0, sizeof(cl_int), (void*) &nthreads); + ret |= clSetKernelArg(LCDkernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(LCDkernel, 2, sizeof(cl_mem), (void*) &top_data); + ret |= clSetKernelArg(LCDkernel, 3, sizeof(cl_mem), (void*) &scale); + ret |= clSetKernelArg(LCDkernel, 4, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(LCDkernel, 5, sizeof(cl_int), (void*) &num); + ret |= clSetKernelArg(LCDkernel, 6, sizeof(cl_int), (void*) &channels); + ret |= clSetKernelArg(LCDkernel, 7, sizeof(cl_int), (void*) &height); + ret |= clSetKernelArg(LCDkernel, 8, sizeof(cl_int), (void*) &width); + ret |= clSetKernelArg(LCDkernel, 9, sizeof(cl_int), (void*) &size); + ret |= clSetKernelArg(LCDkernel, 10, sizeof(Dtype), (void*) &negative_beta); + ret |= clSetKernelArg(LCDkernel, 11, sizeof(Dtype), (void*) &cache_ratio); + ret |= clSetKernelArg(LCDkernel, 12, sizeof(cl_mem), (void*) 
&bottom_diff); + OCL_CHECK(ret); + size_t uiGlobal_Work_Size[] = { (size_t) nthreads }; + size_t uiLocal_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, LCDkernel, 1, NULL, + uiGlobal_Work_Size, uiLocal_Work_Size, 0, NULL, NULL)); +} +template void LRNComputeDiff(const int nthreads, + const float* const bottom_data, const float* const top_data, + const float* const scale, const float* const top_diff, const int num, + const int channels, const int height, const int width, const int size, + const float negative_beta, const float cache_ratio, + float* const bottom_diff); +template void LRNComputeDiff(const int nthreads, + const double* const bottom_data, const double* const top_data, + const double* const scale, const double* const top_diff, const int num, + const int channels, const int height, const int width, const int size, + const double negative_beta, const double cache_ratio, + double* const bottom_diff); + +template +void caffe_gpu_add(const int n, const Dtype* in1, const Dtype* in2, Dtype* y) { + std::string kernel_name = "caffe_gpu_add" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &in1); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &in2); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) n }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void caffe_gpu_add(const int n, const float* in1, + const float* in2, float* y); +template void caffe_gpu_add(const int n, const double* in1, + const double* in2, double* y); + +template +void caffe_gpu_signbit(const int N, const Dtype* X, Dtype * Y) { + std::string kernel_name = "caffe_gpu_sgnbit" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &N); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &X); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &Y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) N }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} +template void caffe_gpu_signbit(const int N, const float* X, float * Y); +template void caffe_gpu_signbit(const int N, const double* X, double * Y); + +template +void caffe_gpu_sign_ocl(const int N, const Dtype* X, Dtype * Y) { + std::string kernel_name = "caffe_gpu_sign" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &N); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &X); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &Y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) N }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void caffe_gpu_sign_ocl(const int N, const float* X, float* Y); +template void caffe_gpu_sign_ocl(const int N, const double* X, + double* Y); + +template +void caffe_gpu_sign_with_offset_ocl(const int N, const Dtype* X, const int 
offx, Dtype * Y, const int offy) { + std::string kernel_name = "caffe_gpu_sign_with_offset" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &N); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &X); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_int), (void*) &offx); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &Y); + ret |= clSetKernelArg(Kernel, 4, sizeof(cl_int), (void*) &offy); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) N }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void caffe_gpu_sign_with_offset_ocl(const int N, const float* X, const int offx, float* Y, const int offy); +template void caffe_gpu_sign_with_offset_ocl(const int N, const double* X, const int offx, double* Y, const int offy); + + +template +void caffe_gpu_abs_ocl(const int N, const Dtype* X, Dtype * Y) { + std::string kernel_name = "caffe_gpu_abs" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &N); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &X); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &Y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) N }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void caffe_gpu_abs_ocl(const int N, const float* X, float* Y); +template void caffe_gpu_abs_ocl(const int N, const double* X, + double* Y); + +template +void caffe_gpu_div(const int n, const Dtype* a, const Dtype* b, Dtype* y) { + std::string kernel_name = "div" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) n }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void caffe_gpu_div(const int n, const float* a, const float* b, + float* y); +template void caffe_gpu_div(const int n, const double* a, + const double* b, double* y); + +template +void caffe_gpu_add_scalar(const int n, const Dtype alpha, Dtype* top_data) { + std::string kernel_name = "add_scalar" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n); + ret |= clSetKernelArg(Kernel, 1, sizeof(Dtype), (void*) &alpha); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) n }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void caffe_gpu_add_scalar(const int n, const float alpha, + float* top_data); +template void caffe_gpu_add_scalar(const int n, const double alpha, + double* top_data); + +template +void caffe_gpu_mul(const int n, 
const Dtype* a, const Dtype* b, Dtype* y) { + std::string kernel_name = "element_mul" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a); + ret |= clSetKernelArg(Kernel, 2, sizeof(cl_mem), (void*) &b); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) n }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void caffe_gpu_mul(const int n, const float* a, const float* b, + float* y); +template void caffe_gpu_mul(const int n, const double* a, + const double* b, double* y); + +template +void caffe_gpu_powx(const int n, const Dtype* a, const Dtype alpha, Dtype* y) { + std::string kernel_name = "powx" + get_dtype_suffix(); + cl_kernel Kernel = amdDevice.GetKernel(kernel_name); + cl_int ret; + ret = clSetKernelArg(Kernel, 0, sizeof(cl_int), (void*) &n); + ret |= clSetKernelArg(Kernel, 1, sizeof(cl_mem), (void*) &a); + ret |= clSetKernelArg(Kernel, 2, sizeof(Dtype), (void*) &alpha); + ret |= clSetKernelArg(Kernel, 3, sizeof(cl_mem), (void*) &y); + OCL_CHECK(ret); + size_t Global_Work_Size[] = { (size_t) n }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, Kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void caffe_gpu_powx(const int n, const float* a, + const float alpha, float* y); +template void caffe_gpu_powx(const int n, const double* a, + const double alpha, double* y); + +template +void DropoutForward(const int count, const Dtype* bottom_data, + const unsigned int* MaskMem, const unsigned int threshold, + const float scale_, Dtype* top_data) { + std::string kernel_name = "DropoutForward" + get_dtype_suffix(); + cl_kernel kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &MaskMem); + ret |= clSetKernelArg(kernel, 3, sizeof(cl_uint), (void*) &threshold); + ret |= clSetKernelArg(kernel, 4, sizeof(cl_float), (void*) &scale_); + ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK(clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} + +template void DropoutForward(const int count, const float* bottom_data, + const unsigned int* MaskMem, const unsigned int threshold, + const float scale_, float* top_data); +template void DropoutForward(const int count, const double* bottom_data, + const unsigned int* MaskMem, const unsigned int threshold, + const float scale_, double* top_data); + +template +void DropoutBackward(const int count, const Dtype* top_diff, const unsigned int* MaskMem, + const unsigned int threshold_, const float scale_, Dtype* bottom_diff) { + std::string kernel_name = "DropoutBackward" + get_dtype_suffix(); + cl_kernel kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &top_diff); + ret |= 
clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &MaskMem); + ret |= clSetKernelArg(kernel, 3, sizeof(cl_uint), (void*) &threshold_); + ret |= clSetKernelArg(kernel, 4, sizeof(cl_float), (void*) &scale_); + ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} +template void DropoutBackward(const int count, const float* top_diff, + const unsigned int* MaskMem, const unsigned int threshold_, const float scale_, + float* bottom_diff); +template void DropoutBackward(const int count, const double* top_diff, + const unsigned int* MaskMem, const unsigned int threshold_, const float scale_, + double* bottom_diff); + +template +void BNLLForward(const int count, const Dtype* bottom_data, Dtype *top_data) { + std::string kernel_name = "BNLLForward" + get_dtype_suffix(); + cl_kernel kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &top_data); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} +template void BNLLForward(const int count, const float* bottom_data, + float *top_data); +template void BNLLForward(const int count, const double* bottom_data, + double *top_data); + +template +void BNLLBackward(const int count, const Dtype* top_diff, + const Dtype* bottom_data, Dtype *bottom_diff) { + std::string kernel_name = "BNLLBackward" + get_dtype_suffix(); + cl_kernel kernel = amdDevice.GetKernel(kernel_name); + + cl_int ret; + ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count); + ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &top_diff); + ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &bottom_data); + ret |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*) &bottom_diff); + OCL_CHECK(ret); + + size_t Global_Work_Size[] = { (size_t) count }; + size_t Local_Work_Size[] = { 256 }; + OCL_CHECK( + clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL, + Global_Work_Size, Local_Work_Size, 0, NULL, NULL)); +} +template void BNLLBackward(const int count, const float* top_diff, + const float* bottom_data, float *bottom_diff); +template void BNLLBackward(const int count, const double* top_diff, + const double* bottom_data, double *bottom_diff); + +template +void Concat(const int nthreads, const Dtype* in_data, const bool forward, + const int num_concats, const int concat_size, const int top_concat_axis, + const int bottom_concat_axis, const int offset_concat_axis, + Dtype *out_data) { + std::string kernel_name = "Concat" + get_dtype_suffix(); + cl_kernel kernel = amdDevice.GetKernel(kernel_name); + int k_forward = (forward == true) ? 
+      1 : 0;
+  cl_int ret;
+  ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &nthreads);
+  ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &in_data);
+  ret |= clSetKernelArg(kernel, 2, sizeof(cl_int), (void*) &k_forward);
+  ret |= clSetKernelArg(kernel, 3, sizeof(cl_int), (void*) &num_concats);
+  ret |= clSetKernelArg(kernel, 4, sizeof(cl_int), (void*) &concat_size);
+  ret |= clSetKernelArg(kernel, 5, sizeof(cl_int), (void*) &top_concat_axis);
+  ret |= clSetKernelArg(kernel, 6, sizeof(cl_int), (void*) &bottom_concat_axis);
+  ret |= clSetKernelArg(kernel, 7, sizeof(cl_int), (void*) &offset_concat_axis);
+  ret |= clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*) &out_data);
+  OCL_CHECK(ret);
+
+  size_t Global_Work_Size[] = { (size_t) nthreads };
+  size_t Local_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+template void Concat<float>(const int nthreads, const float* in_data,
+    const bool forward, const int num_concats, const int concat_size,
+    const int top_concat_axis, const int bottom_concat_axis,
+    const int offset_concat_axis, float *out_data);
+template void Concat<double>(const int nthreads, const double* in_data,
+    const bool forward, const int num_concats, const int concat_size,
+    const int top_concat_axis, const int bottom_concat_axis,
+    const int offset_concat_axis, double *out_data);
+
+template <typename Dtype>
+void CLLBackward(const int count, const int channels, const Dtype margin,
+    const bool legacy_version, const Dtype alpha, const Dtype* y,
+    const Dtype* diff, const Dtype* dist_sq, Dtype *bottom_diff) {
+  std::string kernel_name = "CLLBackward" + get_dtype_suffix<Dtype>();
+  cl_kernel kernel = amdDevice.GetKernel(kernel_name);
+
+  cl_int ret;
+  ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &count);
+  ret |= clSetKernelArg(kernel, 1, sizeof(cl_int), (void*) &channels);
+  ret |= clSetKernelArg(kernel, 2, sizeof(Dtype), (void*) &margin);
+  ret |= clSetKernelArg(kernel, 3, sizeof(cl_bool), (void*) &legacy_version);
+  ret |= clSetKernelArg(kernel, 4, sizeof(Dtype), (void*) &alpha);
+  ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*) &y);
+  ret |= clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*) &diff);
+  ret |= clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*) &dist_sq);
+  ret |= clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*) &bottom_diff);
+  OCL_CHECK(ret);
+
+  size_t Global_Work_Size[] = { (size_t) count };
+  size_t Local_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+template void CLLBackward<float>(const int count, const int channels,
+    const float margin, const bool legacy_version, const float alpha,
+    const float* y, const float* diff, const float* dist_sq,
+    float *bottom_diff);
+template void CLLBackward<double>(const int count, const int channels,
+    const double margin, const bool legacy_version, const double alpha,
+    const double* y, const double* diff, const double* dist_sq,
+    double *bottom_diff);
+
+template <typename Dtype>
+void MaxForward(const int nthreads, const Dtype* bottom_data_a,
+    const Dtype* bottom_data_b, const int blob_idx, Dtype* top_data,
+    int* mask) {
+  std::string kernel_name = "MaxForward" + get_dtype_suffix<Dtype>();
+  cl_kernel kernel = amdDevice.GetKernel(kernel_name);
+
+  cl_int ret;
+  ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &nthreads);
+  ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &bottom_data_a);
+  ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &bottom_data_b);
+  ret |= clSetKernelArg(kernel, 3, sizeof(cl_int), (void*) &blob_idx);
+  ret |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*) &top_data);
+  ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*) &mask);
+  OCL_CHECK(ret);
+
+  size_t Global_Work_Size[] = { (size_t) nthreads };
+  size_t Local_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+template void MaxForward<float>(const int nthreads, const float* bottom_data_a,
+    const float* bottom_data_b, const int blob_idx, float* top_data, int* mask);
+template void MaxForward<double>(const int nthreads,
+    const double* bottom_data_a, const double* bottom_data_b,
+    const int blob_idx, double* top_data, int* mask);
+
+template <typename Dtype>
+void MaxBackward(const int nthreads, const Dtype* top_diff, const int blob_idx,
+    const int* mask, Dtype* bottom_diff) {
+  std::string kernel_name = "MaxBackward" + get_dtype_suffix<Dtype>();
+  cl_kernel kernel = amdDevice.GetKernel(kernel_name);
+
+  cl_int ret;
+  ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &nthreads);
+  ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &top_diff);
+  ret |= clSetKernelArg(kernel, 2, sizeof(cl_int), (void*) &blob_idx);
+  ret |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*) &mask);
+  ret |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*) &bottom_diff);
+  OCL_CHECK(ret);
+
+  size_t Global_Work_Size[] = { (size_t) nthreads };
+  size_t Local_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+template void MaxBackward<float>(const int nthreads, const float* top_diff,
+    const int blob_idx, const int* mask, float* bottom_diff);
+template void MaxBackward<double>(const int nthreads, const double* top_diff,
+    const int blob_idx, const int* mask, double* bottom_diff);
+
+template <typename Dtype>
+void Slice(const int nthreads, const Dtype* in_data,
+    const bool forward, const int num_slices, const int slice_size,
+    const int bottom_slice_axis, const int top_slice_axis,
+    const int offset_slice_axis, Dtype* out_data) {
+  std::string kernel_name = "Slice" + get_dtype_suffix<Dtype>();
+  cl_kernel kernel = amdDevice.GetKernel(kernel_name);
+  int k_forward = (forward == true) ? 1 : 0;
+  cl_int ret;
+  ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*) &nthreads);
+  ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &in_data);
+  ret |= clSetKernelArg(kernel, 2, sizeof(cl_int), (void*) &k_forward);
+  ret |= clSetKernelArg(kernel, 3, sizeof(cl_int), (void*) &num_slices);
+  ret |= clSetKernelArg(kernel, 4, sizeof(cl_int), (void*) &slice_size);
+  ret |= clSetKernelArg(kernel, 5, sizeof(cl_int), (void*) &bottom_slice_axis);
+  ret |= clSetKernelArg(kernel, 6, sizeof(cl_int), (void*) &top_slice_axis);
+  ret |= clSetKernelArg(kernel, 7, sizeof(cl_int), (void*) &offset_slice_axis);
+  ret |= clSetKernelArg(kernel, 8, sizeof(cl_mem), (void*) &out_data);
+  OCL_CHECK(ret);
+
+  size_t Global_Work_Size[] = { (size_t) nthreads };
+  size_t Local_Work_Size[] = { 256 };
+  OCL_CHECK(
+      clEnqueueNDRangeKernel(amdDevice.CommandQueue, kernel, 1, NULL,
+          Global_Work_Size, Local_Work_Size, 0, NULL, NULL));
+}
+template void Slice<float>(const int nthreads, const float* in_data,
+    const bool forward, const int num_slices, const int slice_size,
+    const int bottom_slice_axis, const int top_slice_axis,
+    const int offset_slice_axis, float* out_data);
+template void Slice<double>(const int nthreads, const double* in_data,
+    const bool forward, const int num_slices, const int slice_size,
+    const int bottom_slice_axis, const int top_slice_axis,
+    const int offset_slice_axis, double* out_data);
+
+template <typename Dtype>
+void ocl_conv(Dtype* bottom_data, Dtype* top_data, Dtype* weights, Dtype* bias,
+    int channel_in, int width, int height, int channel_out, int width_out,
+    int height_out, int kernel_w, int kernel_h, int stride, int pad,
+    int batch_sz) {
+}
+template void ocl_conv<float>(float* bottom_data, float* top_data,
+    float* weights, float* bias, int channel_in, int width, int height,
+    int channel_out, int width_out, int height_out, int kernel_w, int kernel_h,
+    int stride, int pad, int batch_sz);
+template void ocl_conv<double>(double* bottom_data, double* top_data,
+    double* weights, double* bias, int channel_in, int width, int height,
+    int channel_out, int width_out, int height_out, int kernel_w, int kernel_h,
+    int stride, int pad, int batch_sz);
+
+#endif
+
+}  // namespace caffe
+
diff --git a/src/caffe/util/upgrade_proto.cpp b/src/caffe/util/upgrade_proto.cpp
index 38a06026..028dd884 100644
--- a/src/caffe/util/upgrade_proto.cpp
+++ b/src/caffe/util/upgrade_proto.cpp
@@ -30,7 +30,7 @@ bool NetNeedsV1ToV2Upgrade(const NetParameter& net_param) {
 }
 
 bool UpgradeV0Net(const NetParameter& v0_net_param_padding_layers,
-                  NetParameter* net_param) {
+    NetParameter* net_param) {
   // First upgrade padding layers to padded conv layers.
   NetParameter v0_net_param;
   UpgradeV0PaddingLayers(v0_net_param_padding_layers, &v0_net_param);
@@ -42,7 +42,7 @@ bool UpgradeV0Net(const NetParameter& v0_net_param_padding_layers,
   }
   for (int i = 0; i < v0_net_param.layers_size(); ++i) {
     is_fully_compatible &= UpgradeV0LayerParameter(v0_net_param.layers(i),
-                                                   net_param->add_layers());
+        net_param->add_layers());
   }
   for (int i = 0; i < v0_net_param.input_size(); ++i) {
     net_param->add_input(v0_net_param.input(i));
@@ -57,7 +57,7 @@ bool UpgradeV0Net(const NetParameter& v0_net_param_padding_layers,
 }
 
 void UpgradeV0PaddingLayers(const NetParameter& param,
-                            NetParameter* param_upgraded_pad) {
+    NetParameter* param_upgraded_pad) {
   // Copy everything other than the layers from the original param.
   param_upgraded_pad->Clear();
   param_upgraded_pad->CopyFrom(param);
@@ -77,8 +77,8 @@ void UpgradeV0PaddingLayers(const NetParameter& param,
     }
     for (int j = 0; j < layer_connection.bottom_size(); ++j) {
       const string& blob_name = layer_connection.bottom(j);
-      if (blob_name_to_last_top_idx.find(blob_name) ==
-          blob_name_to_last_top_idx.end()) {
+      if (blob_name_to_last_top_idx.find(blob_name)
+          == blob_name_to_last_top_idx.end()) {
        LOG(FATAL) << "Unknown blob input " << blob_name << " to layer " << j;
       }
       const int top_idx = blob_name_to_last_top_idx[blob_name];
@@ -93,7 +93,7 @@ void UpgradeV0PaddingLayers(const NetParameter& param,
       // cases have undefined behavior in Caffe.
       CHECK((layer_param.type() == "conv") || (layer_param.type() == "pool"))
           << "Padding layer input to "
-             "non-convolutional / non-pooling layer type "
+          "non-convolutional / non-pooling layer type "
           << layer_param.type();
       CHECK_EQ(layer_connection.bottom_size(), 1)
           << "Conv Layer takes a single blob as input.";
@@ -102,10 +102,10 @@ void UpgradeV0PaddingLayers(const NetParameter& param,
       CHECK_EQ(source_layer.top_size(), 1)
           << "Padding Layer produces a single blob as output.";
       int layer_index = param_upgraded_pad->layers_size() - 1;
-      param_upgraded_pad->mutable_layers(layer_index)->mutable_layer()
-          ->set_pad(source_layer.layer().pad());
-      param_upgraded_pad->mutable_layers(layer_index)
-          ->set_bottom(j, source_layer.bottom(0));
+      param_upgraded_pad->mutable_layers(layer_index)->mutable_layer()->set_pad(
+          source_layer.layer().pad());
+      param_upgraded_pad->mutable_layers(layer_index)->set_bottom(j,
+          source_layer.bottom(0));
     }
   }
   for (int j = 0; j < layer_connection.top_size(); ++j) {
@@ -116,7 +116,7 @@ void UpgradeV0PaddingLayers(const NetParameter& param,
 }
 
 bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
-                             V1LayerParameter* layer_param) {
+    V1LayerParameter* layer_param) {
   bool is_fully_compatible = true;
   layer_param->Clear();
   for (int i = 0; i < v0_layer_connection.bottom_size(); ++i) {
@@ -169,11 +169,11 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
   }
   if (v0_layer_param.has_weight_filler()) {
     if (type == "conv") {
-      layer_param->mutable_convolution_param()->
-          mutable_weight_filler()->CopyFrom(v0_layer_param.weight_filler());
+      layer_param->mutable_convolution_param()->mutable_weight_filler()->CopyFrom(
+          v0_layer_param.weight_filler());
     } else if (type == "innerproduct") {
-      layer_param->mutable_inner_product_param()->
-          mutable_weight_filler()->CopyFrom(v0_layer_param.weight_filler());
+      layer_param->mutable_inner_product_param()->mutable_weight_filler()->CopyFrom(
+          v0_layer_param.weight_filler());
     } else {
       LOG(ERROR) << "Unknown parameter weight_filler for layer type " << type;
       is_fully_compatible = false;
@@ -181,11 +181,11 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
   }
   if (v0_layer_param.has_bias_filler()) {
     if (type == "conv") {
-      layer_param->mutable_convolution_param()->
-          mutable_bias_filler()->CopyFrom(v0_layer_param.bias_filler());
+      layer_param->mutable_convolution_param()->mutable_bias_filler()->CopyFrom(
+          v0_layer_param.bias_filler());
     } else if (type == "innerproduct") {
-      layer_param->mutable_inner_product_param()->
-          mutable_bias_filler()->CopyFrom(v0_layer_param.bias_filler());
+      layer_param->mutable_inner_product_param()->mutable_bias_filler()->CopyFrom(
+          v0_layer_param.bias_filler());
     } else {
       LOG(ERROR) << "Unknown parameter bias_filler for layer type " << type;
       is_fully_compatible = false;
@@ -322,12 +322,11 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
     }
   }
   if (v0_layer_param.has_scale()) {
-    layer_param->mutable_transform_param()->
-        set_scale(v0_layer_param.scale());
+    layer_param->mutable_transform_param()->set_scale(v0_layer_param.scale());
   }
   if (v0_layer_param.has_meanfile()) {
-    layer_param->mutable_transform_param()->
-        set_mean_file(v0_layer_param.meanfile());
+    layer_param->mutable_transform_param()->set_mean_file(
+        v0_layer_param.meanfile());
   }
   if (v0_layer_param.has_batchsize()) {
     if (type == "data") {
@@ -348,12 +347,12 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
     }
   }
   if (v0_layer_param.has_cropsize()) {
-    layer_param->mutable_transform_param()->
-        set_crop_size(v0_layer_param.cropsize());
+    layer_param->mutable_transform_param()->set_crop_size(
+        v0_layer_param.cropsize());
   }
   if (v0_layer_param.has_mirror()) {
-    layer_param->mutable_transform_param()->
-        set_mirror(v0_layer_param.mirror());
+    layer_param->mutable_transform_param()->set_mirror(
+        v0_layer_param.mirror());
   }
   if (v0_layer_param.has_rand_skip()) {
    if (type == "data") {
@@ -409,7 +408,7 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
           v0_layer_param.det_fg_threshold());
     } else {
       LOG(ERROR) << "Unknown parameter det_fg_threshold for layer type "
-          << type;
+                 << type;
       is_fully_compatible = false;
     }
   }
@@ -419,7 +418,7 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
           v0_layer_param.det_bg_threshold());
     } else {
       LOG(ERROR) << "Unknown parameter det_bg_threshold for layer type "
-          << type;
+                 << type;
       is_fully_compatible = false;
     }
   }
@@ -429,7 +428,7 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
           v0_layer_param.det_fg_fraction());
     } else {
       LOG(ERROR) << "Unknown parameter det_fg_fraction for layer type "
-          << type;
+                 << type;
       is_fully_compatible = false;
     }
   }
@@ -439,7 +438,7 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
           v0_layer_param.det_context_pad());
     } else {
       LOG(ERROR) << "Unknown parameter det_context_pad for layer type "
-          << type;
+                 << type;
      is_fully_compatible = false;
     }
   }
@@ -448,8 +447,7 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
       layer_param->mutable_window_data_param()->set_crop_mode(
           v0_layer_param.det_crop_mode());
     } else {
-      LOG(ERROR) << "Unknown parameter det_crop_mode for layer type "
-          << type;
+      LOG(ERROR) << "Unknown parameter det_crop_mode for layer type " << type;
       is_fully_compatible = false;
     }
   }
@@ -459,7 +457,7 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
           v0_layer_param.hdf5_output_param());
     } else {
       LOG(ERROR) << "Unknown parameter hdf5_output_param for layer type "
-          << type;
+                 << type;
       is_fully_compatible = false;
     }
   }
@@ -526,24 +524,48 @@ bool NetNeedsDataUpgrade(const NetParameter& net_param) {
   for (int i = 0; i < net_param.layers_size(); ++i) {
     if (net_param.layers(i).type() == V1LayerParameter_LayerType_DATA) {
       DataParameter layer_param = net_param.layers(i).data_param();
-      if (layer_param.has_scale()) { return true; }
-      if (layer_param.has_mean_file()) { return true; }
-      if (layer_param.has_crop_size()) { return true; }
-      if (layer_param.has_mirror()) { return true; }
+      if (layer_param.has_scale()) {
+        return true;
+      }
+      if (layer_param.has_mean_file()) {
+        return true;
+      }
+      if (layer_param.has_crop_size()) {
+        return true;
+      }
+      if (layer_param.has_mirror()) {
+        return true;
+      }
     }
     if (net_param.layers(i).type() == V1LayerParameter_LayerType_IMAGE_DATA) {
       ImageDataParameter layer_param = net_param.layers(i).image_data_param();
-      if (layer_param.has_scale()) { return true; }
-      if (layer_param.has_mean_file()) { return true; }
-      if (layer_param.has_crop_size()) { return true; }
-      if (layer_param.has_mirror()) { return true; }
+      if (layer_param.has_scale()) {
+        return true;
+      }
+      if (layer_param.has_mean_file()) {
+        return true;
+      }
+      if (layer_param.has_crop_size()) {
+        return true;
+      }
+      if (layer_param.has_mirror()) {
+        return true;
+      }
     }
     if (net_param.layers(i).type() == V1LayerParameter_LayerType_WINDOW_DATA) {
       WindowDataParameter layer_param = net_param.layers(i).window_data_param();
-      if (layer_param.has_scale()) { return true; }
-      if (layer_param.has_mean_file()) { return true; }
-      if (layer_param.has_crop_size()) { return true; }
-      if (layer_param.has_mirror()) { return true; }
+      if (layer_param.has_scale()) {
+        return true;
+      }
+      if (layer_param.has_mean_file()) {
+        return true;
+      }
+      if (layer_param.has_crop_size()) {
+        return true;
+      }
+      if (layer_param.has_mirror()) {
+        return true;
+      }
     }
   }
   return false;
@@ -589,7 +611,7 @@ bool UpgradeNetAsNeeded(const string& param_file, NetParameter* param) {
     // NetParameter was specified using the old style (V0LayerParameter); try to
     // upgrade it.
     LOG(ERROR) << "Attempting to upgrade input file specified using deprecated "
-        << "V0LayerParameter: " << param_file;
+               << "V0LayerParameter: " << param_file;
     NetParameter original_param(*param);
     if (!UpgradeV0Net(original_param, param)) {
       success = false;
@@ -597,7 +619,7 @@ bool UpgradeNetAsNeeded(const string& param_file, NetParameter* param) {
                  << "V0NetParameter to NetParameter (see above); continuing anyway.";
     } else {
       LOG(INFO) << "Successfully upgraded file specified using deprecated "
-          << "V0LayerParameter";
+                << "V0LayerParameter";
     }
     LOG(ERROR) << "Note that future Caffe releases will not support "
         << "V0NetParameter; use ./build/tools/upgrade_net_proto_text for "
@@ -607,16 +629,16 @@ bool UpgradeNetAsNeeded(const string& param_file, NetParameter* param) {
   // NetParameter uses old style data transformation fields; try to upgrade it.
   if (NetNeedsDataUpgrade(*param)) {
     LOG(ERROR) << "Attempting to upgrade input file specified using deprecated "
-        << "transformation parameters: " << param_file;
+               << "transformation parameters: " << param_file;
     UpgradeNetDataTransformation(param);
     LOG(INFO) << "Successfully upgraded file specified using deprecated "
-        << "data transformation parameters.";
+              << "data transformation parameters.";
     LOG(ERROR) << "Note that future Caffe releases will only support "
-        << "transform_param messages for transformation fields.";
+               << "transform_param messages for transformation fields.";
   }
   if (NetNeedsV1ToV2Upgrade(*param)) {
     LOG(ERROR) << "Attempting to upgrade input file specified using deprecated "
-        << "V1LayerParameter: " << param_file;
+               << "V1LayerParameter: " << param_file;
     NetParameter original_param(*param);
     if (!UpgradeV1Net(original_param, param)) {
       success = false;
@@ -624,7 +646,7 @@ bool UpgradeNetAsNeeded(const string& param_file, NetParameter* param) {
                  << "V1LayerParameter (see above); continuing anyway.";
     } else {
       LOG(INFO) << "Successfully upgraded file specified using deprecated "
-          << "V1LayerParameter";
+                << "V1LayerParameter";
     }
   }
   return success;
@@ -634,7 +656,7 @@ bool UpgradeV1Net(const NetParameter& v1_net_param, NetParameter* net_param) {
   bool is_fully_compatible = true;
   if (v1_net_param.layer_size() > 0) {
     LOG(ERROR) << "Input NetParameter to be upgraded already specifies 'layer' "
-        << "fields; these will be ignored for the upgrade.";
+               << "fields; these will be ignored for the upgrade.";
     is_fully_compatible = false;
   }
   net_param->CopyFrom(v1_net_param);
@@ -642,7 +664,7 @@ bool UpgradeV1Net(const NetParameter& v1_net_param, NetParameter* net_param) {
   net_param->clear_layer();
   for (int i = 0; i < v1_net_param.layers_size(); ++i) {
     if (!UpgradeV1LayerParameter(v1_net_param.layers(i),
-                                 net_param->add_layer())) {
+        net_param->add_layer())) {
       LOG(ERROR) << "Upgrade of input layer " << i << " failed.";
       is_fully_compatible = false;
     }
@@ -651,7 +673,7 @@ bool UpgradeV1Net(const NetParameter& v1_net_param, NetParameter* net_param) {
 }
 
 bool UpgradeV1LayerParameter(const V1LayerParameter& v1_layer_param,
-                             LayerParameter* layer_param) {
+    LayerParameter* layer_param) {
   layer_param->Clear();
   bool is_fully_compatible = true;
   for (int i = 0; i < v1_layer_param.bottom_size(); ++i) {
@@ -676,12 +698,16 @@ bool UpgradeV1LayerParameter(const V1LayerParameter& v1_layer_param,
     layer_param->add_blobs()->CopyFrom(v1_layer_param.blobs(i));
   }
   for (int i = 0; i < v1_layer_param.param_size(); ++i) {
-    while (layer_param->param_size() <= i) { layer_param->add_param(); }
+    while (layer_param->param_size() <= i) {
+      layer_param->add_param();
+    }
     layer_param->mutable_param(i)->set_name(v1_layer_param.param(i));
   }
   ParamSpec_DimCheckMode mode;
   for (int i = 0; i < v1_layer_param.blob_share_mode_size(); ++i) {
-    while (layer_param->param_size() <= i) { layer_param->add_param(); }
+    while (layer_param->param_size() <= i) {
+      layer_param->add_param();
+    }
     switch (v1_layer_param.blob_share_mode(i)) {
     case V1LayerParameter_DimCheckMode_STRICT:
       mode = ParamSpec_DimCheckMode_STRICT;
       break;
@@ -691,17 +717,21 @@ bool UpgradeV1LayerParameter(const V1LayerParameter& v1_layer_param,
       break;
     default:
       LOG(FATAL) << "Unknown blob_share_mode: "
-          << v1_layer_param.blob_share_mode(i);
+                 << v1_layer_param.blob_share_mode(i);
       break;
     }
     layer_param->mutable_param(i)->set_share_mode(mode);
   }
   for (int i = 0; i < v1_layer_param.blobs_lr_size(); ++i) {
-    while (layer_param->param_size() <= i) { layer_param->add_param(); }
+    while (layer_param->param_size() <= i) {
+      layer_param->add_param();
+    }
     layer_param->mutable_param(i)->set_lr_mult(v1_layer_param.blobs_lr(i));
   }
   for (int i = 0; i < v1_layer_param.weight_decay_size(); ++i) {
-    while (layer_param->param_size() <= i) { layer_param->add_param(); }
+    while (layer_param->param_size() <= i) {
+      layer_param->add_param();
+    }
     layer_param->mutable_param(i)->set_decay_mult(
         v1_layer_param.weight_decay(i));
   }
@@ -729,8 +759,7 @@ bool UpgradeV1LayerParameter(const V1LayerParameter& v1_layer_param,
         v1_layer_param.convolution_param());
   }
   if (v1_layer_param.has_data_param()) {
-    layer_param->mutable_data_param()->CopyFrom(
-        v1_layer_param.data_param());
+    layer_param->mutable_data_param()->CopyFrom(v1_layer_param.data_param());
   }
   if (v1_layer_param.has_dropout_param()) {
     layer_param->mutable_dropout_param()->CopyFrom(
@@ -745,8 +774,7 @@ bool UpgradeV1LayerParameter(const V1LayerParameter& v1_layer_param,
         v1_layer_param.eltwise_param());
   }
   if (v1_layer_param.has_exp_param()) {
-    layer_param->mutable_exp_param()->CopyFrom(
-        v1_layer_param.exp_param());
+    layer_param->mutable_exp_param()->CopyFrom(v1_layer_param.exp_param());
   }
   if (v1_layer_param.has_hdf5_data_param()) {
     layer_param->mutable_hdf5_data_param()->CopyFrom(
@@ -773,28 +801,24 @@ bool UpgradeV1LayerParameter(const V1LayerParameter& v1_layer_param,
         v1_layer_param.inner_product_param());
   }
   if (v1_layer_param.has_lrn_param()) {
-    layer_param->mutable_lrn_param()->CopyFrom(
-        v1_layer_param.lrn_param());
+    layer_param->mutable_lrn_param()->CopyFrom(v1_layer_param.lrn_param());
   }
   if (v1_layer_param.has_memory_data_param()) {
     layer_param->mutable_memory_data_param()->CopyFrom(
         v1_layer_param.memory_data_param());
   }
   if (v1_layer_param.has_mvn_param()) {
-    layer_param->mutable_mvn_param()->CopyFrom(
-        v1_layer_param.mvn_param());
+    layer_param->mutable_mvn_param()->CopyFrom(v1_layer_param.mvn_param());
   }
   if (v1_layer_param.has_pooling_param()) {
     layer_param->mutable_pooling_param()->CopyFrom(
         v1_layer_param.pooling_param());
   }
   if (v1_layer_param.has_power_param()) {
-    layer_param->mutable_power_param()->CopyFrom(
-        v1_layer_param.power_param());
+    layer_param->mutable_power_param()->CopyFrom(v1_layer_param.power_param());
   }
   if (v1_layer_param.has_relu_param()) {
-    layer_param->mutable_relu_param()->CopyFrom(
-        v1_layer_param.relu_param());
+    layer_param->mutable_relu_param()->CopyFrom(v1_layer_param.relu_param());
   }
   if (v1_layer_param.has_sigmoid_param()) {
     layer_param->mutable_sigmoid_param()->CopyFrom(
@@ -805,12 +829,10 @@ bool UpgradeV1LayerParameter(const V1LayerParameter& v1_layer_param,
         v1_layer_param.softmax_param());
   }
   if (v1_layer_param.has_slice_param()) {
-    layer_param->mutable_slice_param()->CopyFrom(
-        v1_layer_param.slice_param());
+    layer_param->mutable_slice_param()->CopyFrom(v1_layer_param.slice_param());
   }
   if (v1_layer_param.has_tanh_param()) {
-    layer_param->mutable_tanh_param()->CopyFrom(
-        v1_layer_param.tanh_param());
+    layer_param->mutable_tanh_param()->CopyFrom(v1_layer_param.tanh_param());
   }
   if (v1_layer_param.has_threshold_param()) {
     layer_param->mutable_threshold_param()->CopyFrom(
@@ -825,8 +847,7 @@ bool UpgradeV1LayerParameter(const V1LayerParameter& v1_layer_param,
         v1_layer_param.transform_param());
   }
   if (v1_layer_param.has_loss_param()) {
-    layer_param->mutable_loss_param()->CopyFrom(
-        v1_layer_param.loss_param());
+    layer_param->mutable_loss_param()->CopyFrom(v1_layer_param.loss_param());
   }
   if (v1_layer_param.has_layer()) {
     LOG(ERROR) << "Input NetParameter has V0 layer -- ignoring.";
@@ -924,14 +945,14 @@ const char* UpgradeV1LayerType(const V1LayerParameter_LayerType type) {
 }
 
 void ReadNetParamsFromTextFileOrDie(const string& param_file,
-                                    NetParameter* param) {
+    NetParameter* param) {
   CHECK(ReadProtoFromTextFile(param_file, param))
       << "Failed to parse NetParameter file: " << param_file;
   UpgradeNetAsNeeded(param_file, param);
 }
 
 void ReadNetParamsFromBinaryFileOrDie(const string& param_file,
-                                      NetParameter* param) {
+    NetParameter* param) {
   CHECK(ReadProtoFromBinaryFile(param_file, param))
       << "Failed to parse NetParameter file: " << param_file;
   UpgradeNetAsNeeded(param_file, param);
diff --git a/tools/caffe.cpp b/tools/caffe.cpp
index 0b7523fc..79b8e127 100644
--- a/tools/caffe.cpp
+++ b/tools/caffe.cpp
@@ -7,6 +7,7 @@
 
 #include "boost/algorithm/string.hpp"
 #include "caffe/caffe.hpp"
+#include "caffe/device.hpp"
 
 using caffe::Blob;
 using caffe::Caffe;
@@ -15,7 +16,9 @@ using caffe::Layer;
 using caffe::shared_ptr;
 using caffe::Timer;
 using caffe::vector;
-
+#ifndef CPU_ONLY
+using caffe::amdDevice;
+#endif
 
 DEFINE_int32(gpu, -1,
     "Run in GPU mode on given device ID.");
@@ -117,7 +120,7 @@ int train() {
     LOG(INFO) << "Use CPU.";
     Caffe::set_mode(Caffe::CPU);
   }
-  
+
   LOG(INFO) << "Starting Optimization";
   shared_ptr<caffe::Solver<float> >
     solver(caffe::GetSolver<float>(solver_param));
@@ -246,6 +249,9 @@ int time() {
   std::vector<double> backward_time_per_layer(layers.size(), 0.0);
   double forward_time = 0.0;
   double backward_time = 0.0;
+#ifndef CPU_ONLY
+  clFinish(amdDevice.CommandQueue);
+#endif
   for (int j = 0; j < FLAGS_iterations; ++j) {
     Timer iter_timer;
     iter_timer.Start();
@@ -253,6 +259,9 @@ int time() {
     for (int i = 0; i < layers.size(); ++i) {
       timer.Start();
       layers[i]->Forward(bottom_vecs[i], top_vecs[i]);
+#ifndef CPU_ONLY
+      clFinish(amdDevice.CommandQueue);
+#endif
       forward_time_per_layer[i] += timer.MicroSeconds();
     }
     forward_time += forward_timer.MicroSeconds();
@@ -261,6 +270,9 @@ int time() {
       timer.Start();
       layers[i]->Backward(top_vecs[i], bottom_need_backward[i],
                           bottom_vecs[i]);
+#ifndef CPU_ONLY
+      clFinish(amdDevice.CommandQueue);
+#endif
       backward_time_per_layer[i] += timer.MicroSeconds();
     }
     backward_time += backward_timer.MicroSeconds();
@@ -291,8 +303,9 @@ int time() {
 RegisterBrewFunction(time);
 
 int main(int argc, char** argv) {
+  FLAGS_log_dir = "./log/";
   // Print output to stderr (while still logging).
-  FLAGS_alsologtostderr = 1;
+  FLAGS_alsologtostderr = 0;
   // Usage message.
   gflags::SetUsageMessage("command line brew\n"
       "usage: caffe <command> <args>\n\n"